diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h index 84a2673fecb5b..4383249658e60 100644 --- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -35,6 +35,7 @@ struct ComplexDeinterleavingPass enum class ComplexDeinterleavingOperation { CAdd, CMulPartial, + CDot, // The following 'operations' are used to represent internal states. Backends // are not expected to try and support these in any capacity. Deinterleave, @@ -43,6 +44,7 @@ enum class ComplexDeinterleavingOperation { ReductionPHI, ReductionOperation, ReductionSelect, + ReductionSingle }; enum class ComplexDeinterleavingRotation { diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index f3f7ea9407b46..3111354addacd 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -108,6 +108,13 @@ static bool isNeg(Value *V); static Value *getNegOperand(Value *V); namespace { +template +std::optional findCommonBetweenCollections(IterT A, IterT B) { + auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); }); + if (Common != A.end()) + return std::make_optional(*Common); + return std::nullopt; +} class ComplexDeinterleavingLegacyPass : public FunctionPass { public: @@ -144,6 +151,7 @@ struct ComplexDeinterleavingCompositeNode { friend class ComplexDeinterleavingGraph; using NodePtr = std::shared_ptr; using RawNodePtr = ComplexDeinterleavingCompositeNode *; + bool OperandsValid = true; public: ComplexDeinterleavingOperation Operation; @@ -160,7 +168,11 @@ struct ComplexDeinterleavingCompositeNode { SmallVector Operands; Value *ReplacementNode = nullptr; - void addOperand(NodePtr Node) { Operands.push_back(Node.get()); } + void addOperand(NodePtr Node) { + if (!Node || !Node.get()) + OperandsValid = false; + Operands.push_back(Node.get()); + } void dump() { dump(dbgs()); } void dump(raw_ostream &OS) { @@ -194,6 +206,8 @@ struct ComplexDeinterleavingCompositeNode { PrintNodeRef(Op); } } + + bool areOperandsValid() { return OperandsValid; } }; class ComplexDeinterleavingGraph { @@ -293,7 +307,7 @@ class ComplexDeinterleavingGraph { NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); - if (Node->Real && Node->Imag) + if (Node->Real) CachedResult[{Node->Real, Node->Imag}] = Node; return Node; } @@ -327,6 +341,8 @@ class ComplexDeinterleavingGraph { /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); + NodePtr identifyPartialReduction(Value *R, Value *I); + NodePtr identifyDotProduct(Value *Inst); NodePtr identifyNode(Value *R, Value *I); @@ -396,6 +412,7 @@ class ComplexDeinterleavingGraph { /// * Deinterleave the final value outside of the loop and repurpose original /// reduction users void processReductionOperation(Value *OperationReplacement, RawNodePtr Node); + void processReductionSingle(Value *OperationReplacement, RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -891,17 +908,163 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { - LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); - assert(R->getType() == I->getType() && - "Real and imaginary parts should not have different types"); +ComplexDeinterleavingGraph::identifyDotProduct(Value *V) { + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CDot, V->getType())) { + LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving " + "operation CDot with the type " + << *V->getType() << "\n"); + return nullptr; + } + + auto *Inst = cast(V); + auto *RealUser = cast(*Inst->user_begin()); + + NodePtr CN = + prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr); + + NodePtr ANode; + + const Intrinsic::ID PartialReduceInt = + Intrinsic::experimental_vector_partial_reduce_add; + + Value *AReal = nullptr; + Value *AImag = nullptr; + Value *BReal = nullptr; + Value *BImag = nullptr; + Value *Phi = nullptr; + + auto UnwrapCast = [](Value *V) -> Value * { + if (auto *CI = dyn_cast(V)) + return CI->getOperand(0); + return V; + }; + + auto PatternRot0 = m_Intrinsic( + m_Intrinsic(m_Value(Phi), + m_Mul(m_Value(BReal), m_Value(AReal))), + m_Neg(m_Mul(m_Value(BImag), m_Value(AImag)))); + + auto PatternRot270 = m_Intrinsic( + m_Intrinsic( + m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))), + m_Mul(m_Value(BImag), m_Value(AReal))); + + if (match(Inst, PatternRot0)) { + CN->Rotation = ComplexDeinterleavingRotation::Rotation_0; + } else if (match(Inst, PatternRot270)) { + CN->Rotation = ComplexDeinterleavingRotation::Rotation_270; + } else { + Value *A0, *A1; + // The rotations 90 and 180 share the same operation pattern, so inspect the + // order of the operands, identifying where the real and imaginary + // components of A go, to discern between the aforementioned rotations. + auto PatternRot90Rot180 = m_Intrinsic( + m_Intrinsic(m_Value(Phi), + m_Mul(m_Value(BReal), m_Value(A0))), + m_Mul(m_Value(BImag), m_Value(A1))); + + if (!match(Inst, PatternRot90Rot180)) + return nullptr; + + A0 = UnwrapCast(A0); + A1 = UnwrapCast(A1); + + // Test if A0 is real/A1 is imag + ANode = identifyNode(A0, A1); + if (!ANode) { + // Test if A0 is imag/A1 is real + ANode = identifyNode(A1, A0); + // Unable to identify operand components, thus unable to identify rotation + if (!ANode) + return nullptr; + CN->Rotation = ComplexDeinterleavingRotation::Rotation_90; + AReal = A1; + AImag = A0; + } else { + AReal = A0; + AImag = A1; + CN->Rotation = ComplexDeinterleavingRotation::Rotation_180; + } + } + + AReal = UnwrapCast(AReal); + AImag = UnwrapCast(AImag); + BReal = UnwrapCast(BReal); + BImag = UnwrapCast(BImag); + + VectorType *VTy = cast(V->getType()); + Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2); + if (AReal->getType() != ExpectedOperandTy) + return nullptr; + if (AImag->getType() != ExpectedOperandTy) + return nullptr; + if (BReal->getType() != ExpectedOperandTy) + return nullptr; + if (BImag->getType() != ExpectedOperandTy) + return nullptr; + + if (Phi->getType() != VTy && RealUser->getType() != VTy) + return nullptr; + + NodePtr Node = identifyNode(AReal, AImag); + + // In the case that a node was identified to figure out the rotation, ensure + // that trying to identify a node with AReal and AImag post-unwrap results in + // the same node + if (ANode && Node != ANode) { + LLVM_DEBUG( + dbgs() + << "Identified node is different from previously identified node. " + "Unable to confidently generate a complex operation node\n"); + return nullptr; + } + + CN->addOperand(Node); + CN->addOperand(identifyNode(BReal, BImag)); + CN->addOperand(identifyNode(Phi, RealUser)); + + return submitCompositeNode(CN); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { + // Partial reductions don't support non-vector types, so check these first + if (!isa(R->getType()) || !isa(I->getType())) + return nullptr; + + auto CommonUser = + findCommonBetweenCollections(R->users(), I->users()); + if (!CommonUser) + return nullptr; + + auto *IInst = dyn_cast(*CommonUser); + if (!IInst || IInst->getIntrinsicID() != + Intrinsic::experimental_vector_partial_reduce_add) + return nullptr; + + if (NodePtr CN = identifyDotProduct(IInst)) + return CN; + return nullptr; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { auto It = CachedResult.find({R, I}); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return It->second; } + if (NodePtr CN = identifyPartialReduction(R, I)) + return CN; + + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if (!IsReduction && R->getType() != I->getType()) + return nullptr; + if (NodePtr CN = identifySplat(R, I)) return CN; @@ -1427,12 +1590,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation); + ComplexDeinterleavingOperation::ReductionOperation || + RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); - auto *I = cast(RootNode->Imag); - auto *ReplacementAnchor = R->comesBefore(I) ? I : R; + auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; + + Instruction *ReplacementAnchor; + if (I) + ReplacementAnchor = R->comesBefore(I) ? I : R; + else + ReplacementAnchor = R; + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1523,7 +1694,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - auto *Real = OperationInstruction[i]; auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) @@ -1556,6 +1726,28 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { break; } } + + auto *Real = OperationInstruction[i]; + // We want to check that we have 2 operands, but the function attributes + // being counted as operands bloats this value. + if (Real->getNumOperands() < 2) + continue; + + RealPHI = ReductionInfo[Real].first; + ImagPHI = nullptr; + PHIsFound = false; + auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1)); + if (Node && PHIsFound) { + LLVM_DEBUG( + dbgs() << "Identified single reduction starting from instruction: " + << *Real << "/" << *ReductionInfo[Real].second << "\n"); + Processed[i] = true; + auto RootNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); + RootNode->addOperand(Node); + RootToNode[Real] = RootNode; + submitCompositeNode(RootNode); + } } RealPHI = nullptr; @@ -1563,6 +1755,12 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } bool ComplexDeinterleavingGraph::checkNodes() { + + for (NodePtr N : CompositeNodes) { + if (!N->areOperandsValid()) + return false; + } + // Collect all instructions from roots to leaves SmallPtrSet AllInstructions; SmallVector Worklist; @@ -1831,7 +2029,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, Instruction *Imag) { - if (Real != RealPHI || Imag != ImagPHI) + if (Real != RealPHI || (ImagPHI && Imag != ImagPHI)) return nullptr; PHIsFound = true; @@ -1926,6 +2124,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, Value *ReplacementNode; switch (Node->Operation) { + case ComplexDeinterleavingOperation::CDot: { + Value *Input0 = ReplaceOperandIfExist(Node, 0); + Value *Input1 = ReplaceOperandIfExist(Node, 1); + Value *Accumulator = ReplaceOperandIfExist(Node, 2); + assert(!Input1 || (Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type")); + ReplacementNode = TL->createComplexDeinterleavingIR( + Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); + break; + } case ComplexDeinterleavingOperation::CAdd: case ComplexDeinterleavingOperation::CMulPartial: case ComplexDeinterleavingOperation::Symmetric: { @@ -1969,13 +2177,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. + auto *OldPHI = cast(Node->Real); auto *VTy = cast(Node->Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt()); - OldToNewPHI[dyn_cast(Node->Real)] = NewPHI; + OldToNewPHI[OldPHI] = NewPHI; ReplacementNode = NewPHI; break; } + case ComplexDeinterleavingOperation::ReductionSingle: + ReplacementNode = replaceNode(Builder, Node->Operands[0]); + processReductionSingle(ReplacementNode, Node); + break; case ComplexDeinterleavingOperation::ReductionOperation: ReplacementNode = replaceNode(Builder, Node->Operands[0]); processReductionOperation(ReplacementNode, Node); @@ -2000,6 +2213,38 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, return ReplacementNode; } +void ComplexDeinterleavingGraph::processReductionSingle( + Value *OperationReplacement, RawNodePtr Node) { + auto *Real = cast(Node->Real); + auto *OldPHI = ReductionInfo[Real].first; + auto *NewPHI = OldToNewPHI[OldPHI]; + auto *VTy = cast(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + Value *Init = OldPHI->getIncomingValueForBlock(Incoming); + + IRBuilder<> Builder(Incoming->getTerminator()); + + Value *NewInit = nullptr; + if (auto *C = dyn_cast(Init)) { + if (C->isZeroValue()) + NewInit = Constant::getNullValue(NewVTy); + } + + if (!NewInit) + NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, + {Init, Constant::getNullValue(VTy)}); + + NewPHI->addIncoming(NewInit, Incoming); + NewPHI->addIncoming(OperationReplacement, BackEdge); + + auto *FinalReduction = ReductionInfo[Real].second; + Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt()); + + auto *AddReduce = Builder.CreateAddReduce(OperationReplacement); + FinalReduction->replaceAllUsesWith(AddReduce); +} + void ComplexDeinterleavingGraph::processReductionOperation( Value *OperationReplacement, RawNodePtr Node) { auto *Real = cast(Node->Real); @@ -2059,8 +2304,13 @@ void ComplexDeinterleavingGraph::replaceNodes() { auto *RootImag = cast(RootNode->Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); - DeadInstrRoots.push_back(cast(RootReal)); - DeadInstrRoots.push_back(cast(RootImag)); + DeadInstrRoots.push_back(RootReal); + DeadInstrRoots.push_back(RootImag); + } else if (RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle) { + auto *RootInst = cast(RootNode->Real); + ReductionInfo[RootInst].first->removeIncomingValue(BackEdge); + DeadInstrRoots.push_back(ReductionInfo[RootInst].second); } else { assert(R && "Unable to find replacement for RootInstruction"); DeadInstrRoots.push_back(RootInstruction); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ed2d9a07cec63..423395a994587 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29408,9 +29408,16 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); + + if (Operation == ComplexDeinterleavingOperation::CDot) + return ScalarWidth == 32 || ScalarWidth == 64; return 8 <= ScalarWidth && ScalarWidth <= 64; } + // CDot is not supported outside of scalable/sve scopes + if (Operation == ComplexDeinterleavingOperation::CDot) + return false; + return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); } @@ -29420,6 +29427,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast(InputA->getType()); + if (Accumulator == nullptr) + Accumulator = Constant::getNullValue(Ty); bool IsScalable = Ty->isScalableTy(); bool IsInt = Ty->getElementType()->isIntegerTy(); @@ -29431,6 +29440,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; + int AccStride = cast(Accumulator->getType()) + ->getElementCount() + .getKnownMinValue() / + 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); @@ -29440,25 +29453,26 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride)); Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; - if (Accumulator) { - LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0)); - UpperSplitAcc = - B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); - } + Type *FullTy = Ty; + FullTy = Accumulator->getType(); + auto *HalfAccTy = VectorType::getHalfElementsVectorType( + cast(Accumulator->getType())); + LowerSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); + UpperSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); auto *LowerSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, - B.getInt64(0)); - return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride)); + auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), + LowerSplitInt, B.getInt64(0)); + return B.CreateInsertVector(FullTy, Result, UpperSplitInt, + B.getInt64(AccStride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { - if (Accumulator == nullptr) - Accumulator = Constant::getNullValue(Ty); - if (IsScalable) { if (IsInt) return B.CreateIntrinsic( @@ -29510,6 +29524,13 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } + if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && + IsScalable) { + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_cdot, Accumulator->getType(), + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + } + return nullptr; } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll new file mode 100644 index 0000000000000..11cf4c31936d8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -0,0 +1,1136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE +; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @cdotp_i8_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i64 @cdotp_i16_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + + +define i32 @not_cdotp( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @not_cdotp( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @not_cdotp( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @not_cdotp( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i16 @invalid_type( %a, %b) { +; CHECK-SVE2-LABEL: define i16 @invalid_type( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i16 [[TMP0]] +; +; CHECK-SVE-LABEL: define i16 @invalid_type( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i16 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i16 @invalid_type( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i16 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i16 @llvm.vector.reduce.add.nxv8i16( %partial.reduce.sub) + ret i16 %0 +} + +define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) { +; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length( +; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length( +; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length( +; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a) + %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b) + %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0 + %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1 + %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0 + %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1 + %a.real.ext = sext <16 x i8> %a.real to <16 x i32> + %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32> + %b.real.ext = sext <16 x i8> %b.real to <16 x i32> + %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32> + %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext + %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul) + %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul + %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub) + ret i32 %0 +} + +declare @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(, ) +declare @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(, ) +declare @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(, ) + +declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) + +declare i32 @llvm.vector.reduce.add.nxv4i32() +declare i64 @llvm.vector.reduce.add.nxv2i64()