From 252567a8223e4acf1179be01a4b5b5a88ae4607f Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 11 Jan 2024 15:48:08 +0000 Subject: [PATCH 01/23] [SLP] Initial vectorization of non-power-of-2 ops. This patch enables vectorization for non-power-of-2 VFs. Initially, only VFs where adding 1 makes the VF a power-of-2 are supported, i.e. cases where we can still make relatively effective use of the vectors. It relies on the existing target cost-models to return accurate costs for non-power-of-2 vectors. I checked mostly AArch64 and X86 and there the costs seem reasonable for the cases I checked, although I expect there will be a need to refine both the cost-models and lowering to make most effective use of non-power-of-2 SLP vectorization. Note that re-ordering and shuffling are not implemented for nodes requiring padding yet to keep the initial implementation simpler. The feature is guarded by a new flag, off by default for now. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 108 +++++- .../SLPVectorizer/AArch64/vec15-base.ll | 70 ++-- .../SLPVectorizer/AArch64/vec3-base.ll | 247 +++++++++---- .../SLPVectorizer/AArch64/vec3-calls.ll | 3 +- .../AArch64/vec3-reorder-reshuffle.ll | 305 +++++++++++++--- .../Transforms/SLPVectorizer/X86/odd_store.ll | 66 ++-- .../X86/vect_copyable_in_binops.ll | 343 ++++++++++-------- 7 files changed, 801 insertions(+), 341 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bde65717ac1d46..4ac010e81e9476 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -179,6 +179,10 @@ static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); +static cl::opt VectorizeNonPowerOf2( + "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, + cl::desc("Try to vectorize with non-power-of-2 with number of elements.")); + // Limit the number of alias checks. 
The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -2733,6 +2737,9 @@ class BoUpSLP { SmallVectorImpl *OpScalars = nullptr, SmallVectorImpl *AltScalars = nullptr) const; + /// Return true if the number of scalars in this node is not a power of 2. + bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); } + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -2891,9 +2898,13 @@ class BoUpSLP { ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last); } - if (UserTreeIdx.UserTE) + if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); - + if (!isPowerOf2_32(Last->Scalars.size())) { + assert((Last->ReorderIndices.empty()) && + "Reodering isn't implemented for nodes with padding yet"); + } + } return Last; } @@ -3904,6 +3915,9 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, Order.clear(); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); + if (!Order.empty() && !isPowerOf2_32(VL.size())) + return LoadsState::Gather; + if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); })) { @@ -4593,6 +4607,10 @@ bool BoUpSLP::canReorderOperands( TreeEntry *UserTE, SmallVectorImpl> &Edges, ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps) { + // Reordering isn't implemented for nodes with padding yet. 
+ if (UserTE->isNonPowOf2Vec()) + return false; + for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { if (any_of(Edges, [I](const std::pair &OpData) { return OpData.first == I && @@ -4771,6 +4789,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders]( const TreeEntry *TE) { + // Reordering for nodes with padding not implemented yet. + if (TE->isNonPowOf2Vec()) + return false; if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || (IgnoreReorder && TE->Idx == 0)) @@ -5609,6 +5630,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (PWSz == VL.size()) { ReuseShuffleIndicies.clear(); } else { + if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " + "for nodes with padding.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return false; + } NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); NonUniqueValueVL.append(PWSz - UniqueValues.size(), UniqueValues.back()); @@ -5620,6 +5647,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; } + if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for " + "nodes with padding.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return false; + } VL = UniqueValues; } return true; @@ -6376,6 +6409,10 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder, bool ResizeAllowed) const { + // TODO: Reusing extracts is not supported yet for non-power-of-2 ops. 
+ if (!isPowerOf2_32(VL.size())) + return false; + const auto *It = find_if(VL, [](Value *V) { return isa(V); }); @@ -6987,6 +7024,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL.begin(), VL.end()); + auto ComputeGatherCost = [&]() { + return all_of(Gathers, UndefValue::classof) + ? TTI::TCC_Free + : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)); + }; + + // TODO: Only full gather is supported for non-power-of-2 operations for + // now. + if (!isPowerOf2_32(VL.size())) + return ComputeGatherCost(); + // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. InstructionsState S = getSameOpcode(VL, *R.TLI); @@ -9741,6 +9789,9 @@ BoUpSLP::isGatherShuffledEntry( // No need to check for the topmost gather node. if (TE == VectorizableTree.front().get()) return {}; + // Gathering for nodes with padding is not implemented yet. + if (TE->isNonPowOf2Vec()) + return {}; Mask.assign(VL.size(), PoisonMaskElem); assert(TE->UserTreeIndices.size() == 1 && "Expected only single user of the gather node."); @@ -10532,7 +10583,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, SmallVector Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); reorderScalars(VL, Mask); } - const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. 
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { @@ -10574,6 +10624,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, ShuffleBuilder.add(V, Mask); return ShuffleBuilder.finalize(std::nullopt); }; + const unsigned VF = VL.size(); Value *V = vectorizeTree(VE, PostponedPHIs); if (VF != cast(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { @@ -10653,7 +10704,16 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, template ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); + unsigned VF = E->getVectorFactor(); + BVTy ShuffleBuilder(Params...); + if (E->isNonPowOf2Vec()) { + Value *BV = ShuffleBuilder.gather(E->Scalars); + SmallVector Mask(VF, PoisonMaskElem); + std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0); + ShuffleBuilder.add(BV, Mask); + return ShuffleBuilder.finalize(E->ReuseShuffleIndices); + } bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), @@ -10699,7 +10759,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } return true; }; - BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector Mask; SmallVector ExtractMask(GatheredScalars.size(), PoisonMaskElem); @@ -13480,8 +13539,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, const unsigned Sz = R.getVectorElementSize(Chain[0]); unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) - return false; + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) { + // Check if vectorizing with a non-power-of-2 VF should be considered. At + // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost + // all vector lanes are used. 
+ if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF)) + return false; + } LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx << "\n"); @@ -13577,9 +13641,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, << "MinVF (" << MinVF << ")\n"); } + unsigned StartIdx = 0; + if (VectorizeNonPowerOf2) { + // Try vectorizing with a non-power-of-2 VF. At the moment, only + // consider cases where VF + 1 is a power-of-2, i.e. almost all vector + // lanes are used. + unsigned CandVF = Operands.size() + 1; + if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) { + assert( + all_of( + Operands, + [&](Value *V) { + return cast(V)->getValueOperand()->getType() == + cast(Operands.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Operands.front()) && + !VectorizedStores.count(Operands.back()) && + TriedSequences + .insert(std::make_pair(Operands.front(), Operands.back())) + .second && + vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed = true; + StartIdx += Operands.size(); + } + } + } + // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? 
- unsigned StartIdx = 0; for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll index b9e959d50befdd..7b27489782fc46 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll @@ -1,35 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=POW2-ONLY %s define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) { -; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store( -; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], -; CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1 -; CHECK-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8 -; CHECK-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], -; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1 -; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12 -; CHECK-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4 -; CHECK-NEXT: [[MUL_12:%.*]] = 
mul nsw i8 [[L_SRC_12]], 10 -; CHECK-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12 -; CHECK-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1 -; CHECK-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13 -; CHECK-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4 -; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10 -; CHECK-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13 -; CHECK-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1 -; CHECK-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14 -; CHECK-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4 -; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10 -; CHECK-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14 -; CHECK-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @v15_load_i8_mul_by_constant_store( +; NON-POW2-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = load <15 x i8>, ptr [[GEP_SRC_0]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <15 x i8> [[TMP0]], +; NON-POW2-NEXT: store <15 x i8> [[TMP1]], ptr [[DST]], align 1 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @v15_load_i8_mul_by_constant_store( +; POW2-ONLY-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], +; POW2-ONLY-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1 +; POW2-ONLY-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8 +; POW2-ONLY-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], 
align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], +; POW2-ONLY-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1 +; POW2-ONLY-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12 +; POW2-ONLY-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4 +; POW2-ONLY-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10 +; POW2-ONLY-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12 +; POW2-ONLY-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1 +; POW2-ONLY-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13 +; POW2-ONLY-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4 +; POW2-ONLY-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10 +; POW2-ONLY-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13 +; POW2-ONLY-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1 +; POW2-ONLY-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14 +; POW2-ONLY-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4 +; POW2-ONLY-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10 +; POW2-ONLY-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14 +; POW2-ONLY-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1 +; POW2-ONLY-NEXT: ret void ; entry: %gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0 @@ -123,5 +133,3 @@ entry: ret void } - - diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll index 59ffbf7ef9b247..c18811a35c1eeb 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll @@ -1,16 +1,69 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt 
-passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) { -; CHECK-LABEL: @v3_load_i32_mul_by_constant_store( +; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], +; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2 +; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], +; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 +; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4 +; POW2-ONLY-NEXT: ret void +; +entry: + %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0 + %l.src.0 = load i32, ptr %gep.src.0, align 4 + %mul.0 = mul nsw i32 %l.src.0, 10 + + %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1 + %l.src.1 = load i32, ptr %gep.src.1, align 4 + %mul.1 = mul nsw i32 %l.src.1, 10 + + %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2 + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %mul.2 = mul nsw i32 %l.src.2, 10 + + store i32 %mul.0, ptr %dst + + %dst.1 = getelementptr i32, ptr %dst, i32 1 + store i32 %mul.1, ptr %dst.1 + + %dst.2 = 
getelementptr i32, ptr %dst, i32 2 + store i32 %mul.2, ptr %dst.2 + + ret void +} + +; Should not be vectorized with an undef/poison element as padding, as division by undef/poison may cause UB. +define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) { +; CHECK-LABEL: @v3_load_i32_udiv_by_constant_store( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0 +; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]] +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1 +; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]] ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2 ; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], -; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]] +; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1 +; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4 ; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 ; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4 ; CHECK-NEXT: ret void @@ -18,15 +71,15 @@ define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) { entry: %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0 %l.src.0 = load i32, ptr %gep.src.0, align 4 - %mul.0 = mul nsw i32 %l.src.0, 10 + %mul.0 = udiv i32 10, %l.src.0 %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1 %l.src.1 = load i32, ptr %gep.src.1, align 4 - %mul.1 = mul nsw i32 %l.src.1, 10 + %mul.1 = udiv i32 10, %l.src.1 
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2 %l.src.2 = load i32, ptr %gep.src.2, align 4 - %mul.2 = mul nsw i32 %l.src.2, 10 + %mul.2 = udiv i32 10, %l.src.2 store i32 %mul.0, ptr %dst @@ -39,23 +92,35 @@ entry: ret void } + + define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) { -; CHECK-LABEL: @v3_load_i32_mul_store( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0 -; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0 -; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2 -; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4 -; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2 -; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]] -; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 -; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @v3_load_i32_mul_store( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]] +; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: 
@v3_load_i32_mul_store( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2 +; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2 +; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]] +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]] +; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 +; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0 @@ -88,24 +153,35 @@ entry: } define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) { -; CHECK-LABEL: @v3_load_i32_mul_add_const_store( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0 -; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0 -; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2 -; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4 -; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2 -; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]] -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9 -; 
CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 -; CHECK-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]] +; NON-POW2-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], +; NON-POW2-NEXT: store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2 +; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2 +; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]] +; POW2-ONLY-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, 
ptr [[GEP_SRC_2_0]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]] +; POW2-ONLY-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], +; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2 +; POW2-ONLY-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0 @@ -141,18 +217,26 @@ entry: } define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) { -; CHECK-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 -; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2 -; CHECK-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 -; CHECK-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], -; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2 -; CHECK-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], +; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 
2 +; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], +; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2 +; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0 @@ -179,18 +263,28 @@ entry: } define void @phi_store3(ptr %dst) { -; CHECK-LABEL: @phi_store3( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: invoke.cont8.loopexit: -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ , [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ] -; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2 -; CHECK-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4 -; CHECK-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @phi_store3( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: br label [[EXIT:%.*]] +; NON-POW2: invoke.cont8.loopexit: +; NON-POW2-NEXT: br label [[EXIT]] +; NON-POW2: exit: +; NON-POW2-NEXT: [[TMP0:%.*]] = phi <3 x i32> [ , [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ] +; NON-POW2-NEXT: store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @phi_store3( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: br label [[EXIT:%.*]] +; POW2-ONLY: invoke.cont8.loopexit: +; POW2-ONLY-NEXT: br label [[EXIT]] +; POW2-ONLY: exit: +; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ] +; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ , 
[[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ] +; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2 +; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4 +; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: br label %exit @@ -213,13 +307,18 @@ exit: } define void @store_try_reorder(ptr %dst) { -; CHECK-LABEL: @store_try_reorder( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD:%.*]] = add i32 0, 0 -; CHECK-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @store_try_reorder( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @store_try_reorder( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0 +; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1 +; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %add = add i32 0, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 2cb84eeb7fc8f4..67746f2cbf5d22 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | 
FileCheck --check-prefixes=CHECK %s define void @vec3_vectorize_call(ptr %Colour, float %0) { ; CHECK-LABEL: @vec3_vectorize_call( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 5707e143ad5515..60a353943eed1b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s %struct.zot = type { i32, i32, i32 } @@ -172,32 +173,93 @@ entry: } define i32 @reorder_indices_1(float %0) { -; CHECK-LABEL: define i32 @reorder_indices_1( -; CHECK-SAME: float [[TMP0:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 -; CHECK-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] -; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x 
float> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 -; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 -; CHECK-NEXT: ret i32 0 +; PADDING-LABEL: define i32 @reorder_indices_1( +; PADDING-SAME: float [[TMP0:%.*]]) { +; PADDING-NEXT: entry: +; PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; PADDING-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 +; PADDING-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; PADDING-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1 +; PADDING-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2 +; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]] +; PADDING-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]]) +; PADDING-NEXT: 
[[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer) +; PADDING-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer +; PADDING-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4 +; PADDING-NEXT: ret i32 0 +; +; NO-PADDING-LABEL: define i32 @reorder_indices_1( +; NO-PADDING-SAME: float [[TMP0:%.*]]) { +; NO-PADDING-NEXT: entry: +; NO-PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; NO-PADDING-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 +; NO-PADDING-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 +; NO-PADDING-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 +; NO-PADDING-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; NO-PADDING-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] +; NO-PADDING-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] +; NO-PADDING-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) +; NO-PADDING-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; NO-PADDING-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; NO-PADDING-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] +; NO-PADDING-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; NO-PADDING-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer +; NO-PADDING-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] +; NO-PADDING-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> +; NO-PADDING-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) +; NO-PADDING-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) +; 
NO-PADDING-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; NO-PADDING-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer +; NO-PADDING-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 +; NO-PADDING-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; NO-PADDING-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 +; NO-PADDING-NEXT: ret i32 0 +; +; POW2-ONLY-LABEL: define i32 @reorder_indices_1( +; POW2-ONLY-SAME: float [[TMP0:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]] +; POW2-ONLY-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]]) +; POW2-ONLY-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer +; POW2-ONLY-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: ret i32 0 +; +; NON-POW2-LABEL: define i32 @reorder_indices_1( +; NON-POW2-SAME: float [[TMP0:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; NON-POW2-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 +; NON-POW2-NEXT: [[TMP1:%.*]] = 
load float, ptr [[ARRAYIDX2_I265]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] +; NON-POW2-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] +; NON-POW2-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) +; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; NON-POW2-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] +; NON-POW2-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] +; NON-POW2-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> +; NON-POW2-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) +; NON-POW2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; NON-POW2-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer +; NON-POW2-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 +; NON-POW2-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; NON-POW2-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 +; NON-POW2-NEXT: ret i32 0 ; entry: %nor1 = alloca [0 x [3 x float]], i32 0, align 4 @@ -228,19 +290,63 @@ entry: } define void @reorder_indices_2(ptr %spoint) { -; CHECK-LABEL: define void @reorder_indices_2( -; CHECK-SAME: ptr 
[[SPOINT:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 -; CHECK-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 -; CHECK-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 -; CHECK-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 -; CHECK-NEXT: ret void +; PADDING-LABEL: define void @reorder_indices_2( +; PADDING-SAME: ptr [[SPOINT:%.*]]) { +; PADDING-NEXT: entry: +; PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1 +; PADDING-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2 +; PADDING-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1 +; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2 +; PADDING-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer +; PADDING-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4 +; PADDING-NEXT: ret void +; +; NO-PADDING-LABEL: define void @reorder_indices_2( +; NO-PADDING-SAME: ptr [[SPOINT:%.*]]) { +; NO-PADDING-NEXT: entry: 
+; NO-PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; NO-PADDING-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) +; NO-PADDING-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 +; NO-PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; NO-PADDING-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; NO-PADDING-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer +; NO-PADDING-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 +; NO-PADDING-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 +; NO-PADDING-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 +; NO-PADDING-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @reorder_indices_2( +; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer +; POW2-ONLY-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4 +; POW2-ONLY-NEXT: ret void +; +; NON-POW2-LABEL: define void @reorder_indices_2( +; NON-POW2-SAME: ptr 
[[SPOINT:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) +; NON-POW2-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 +; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; NON-POW2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer +; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 +; NON-POW2-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 +; NON-POW2-NEXT: ret void ; entry: %0 = extractelement <3 x float> zeroinitializer, i64 1 @@ -292,19 +398,55 @@ entry: } define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) { -; CHECK-LABEL: define void @reuse_shuffle_indidces_1( -; CHECK-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 -; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 -; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 -; CHECK-NEXT: ret void +; PADDING-LABEL: define void @reuse_shuffle_indidces_1( +; PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float 
[[TMP1:%.*]]) { +; PADDING-NEXT: entry: +; PADDING-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 +; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 +; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2 +; PADDING-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer +; PADDING-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer +; PADDING-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 +; PADDING-NEXT: ret void +; +; NO-PADDING-LABEL: define void @reuse_shuffle_indidces_1( +; NO-PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; NO-PADDING-NEXT: entry: +; NO-PADDING-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; NO-PADDING-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 +; NO-PADDING-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer +; NO-PADDING-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer +; NO-PADDING-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 +; NO-PADDING-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 +; NO-PADDING-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; NO-PADDING-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 +; NO-PADDING-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 +; NO-PADDING-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1( +; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer +; POW2-ONLY-NEXT: [[TMP6:%.*]] = 
fadd <3 x float> [[TMP5]], zeroinitializer +; POW2-ONLY-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 +; POW2-ONLY-NEXT: ret void +; +; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1( +; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 +; NON-POW2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer +; NON-POW2-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer +; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 +; NON-POW2-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 +; NON-POW2-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; NON-POW2-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 +; NON-POW2-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 +; NON-POW2-NEXT: ret void ; entry: %mul24 = fmul float %1, 0.000000e+00 @@ -513,4 +655,61 @@ entry: ret void } +define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) { +; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding( +; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]] +; POW2-ONLY-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> , 
<3 x float> ) +; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], +; POW2-ONLY-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4 +; POW2-ONLY-NEXT: ret void +; +; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding( +; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2 +; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0 +; NON-POW2-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]] +; NON-POW2-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00) +; NON-POW2-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> +; NON-POW2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> , <2 x float> ) +; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], +; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4 +; NON-POW2-NEXT: store float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4 +; NON-POW2-NEXT: ret void +; +entry: + %arrayidx42.i = getelementptr float, ptr %A, i64 2 + %arrayidx35.i = getelementptr float, ptr %A, i64 1 + %0 = extractelement <3 x float> %in, i64 0 + %1 = extractelement <3 x float> %in, i64 0 + %sub.i362 = fsub float %0, %1 + %2 = extractelement <3 x float> %in, i64 1 + %3 = extractelement <3 x float> %in, i64 1 + %sub5.i = fsub float %2, %3 + %4 = extractelement <3 x float> %in, i64 2 + %5 = extractelement <3 x float> %in, i64 2 + %sub9.i = fsub float %4, %5 + %6 = call float @llvm.fmuladd.f32(float %sub5.i, float 2.000000e+00, float 3.000000e+00) + %7 = call float @llvm.fmuladd.f32(float %sub9.i, float 2.000000e+00, float 3.000000e+00) + %8 = call float @llvm.fmuladd.f32(float %sub.i362, float 2.000000e+00, float 
3.000000e+00) + %mul.i.i.i.i373 = fmul float %6, 3.000000e+00 + %mul3.i.i.i.i = fmul float %7, 3.000000e+00 + %mul6.i.i.i.i = fmul float %8, 3.000000e+00 + store float %mul.i.i.i.i373, ptr %A, align 4 + store float %mul3.i.i.i.i, ptr %arrayidx35.i, align 4 + store float %mul6.i.i.i.i, ptr %arrayidx42.i, align 4 + ret void +} + declare float @llvm.fmuladd.f32(float, float, float) +declare double @llvm.fmuladd.f64(double, double, double) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll index 4795ac65592037..853b4f396aaa50 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer,dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer,dce -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt < %s -passes=slp-vectorizer,dce -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefixes=CHECK,POW2-ONLY %s ;int foo(char * restrict A, ptr restrict B, float T) { ; A[0] = (T * B[10] + 4.0); @@ -8,31 +9,44 @@ ;} define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double -; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8 -; CHECK-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, 
ptr [[B]], i64 11 -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]] -; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double -; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00 -; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 -; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12 -; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]] -; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double -; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00 -; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 -; CHECK-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1 -; CHECK-NEXT: ret i32 undef +; NON-POW2-LABEL: @foo( +; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[T]], i32 1 +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[T]], i32 2 +; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP2]], [[TMP5]] +; NON-POW2-NEXT: [[TMP7:%.*]] = fpext <3 x float> [[TMP6]] to <3 x double> +; NON-POW2-NEXT: [[TMP8:%.*]] = fadd <3 x double> [[TMP7]], +; NON-POW2-NEXT: [[TMP9:%.*]] = fptosi <3 x double> [[TMP8]] to <3 x i8> +; NON-POW2-NEXT: store <3 x i8> [[TMP9]], ptr [[A:%.*]], align 1 +; NON-POW2-NEXT: ret i32 undef +; +; POW2-ONLY-LABEL: @foo( +; POW2-ONLY-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load 
float, ptr [[TMP1]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]] +; POW2-ONLY-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double +; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8 +; POW2-ONLY-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11 +; POW2-ONLY-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 +; POW2-ONLY-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]] +; POW2-ONLY-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double +; POW2-ONLY-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00 +; POW2-ONLY-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8 +; POW2-ONLY-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 +; POW2-ONLY-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1 +; POW2-ONLY-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12 +; POW2-ONLY-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 +; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]] +; POW2-ONLY-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double +; POW2-ONLY-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00 +; POW2-ONLY-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8 +; POW2-ONLY-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 +; POW2-ONLY-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1 +; POW2-ONLY-NEXT: ret i32 undef ; %1 = getelementptr inbounds float, ptr %B, i64 10 %2 = load float, ptr %1, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 22cd408cd6dc7f..e30cb76d53d928 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s define void @add0(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @add0( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -32,21 +33,32 @@ entry: } define void @add1(ptr noalias %dst, ptr noalias %src) { -; CHECK-LABEL: @add1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP5]], 3 -; CHECK-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4 -; CHECK-NEXT: ret void 
+; NON-POW2-LABEL: @add1( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 +; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 +; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = add nsw <3 x i32> [[TMP1]], +; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @add1( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 +; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 +; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], +; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4 +; POW2-ONLY-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 +; POW2-ONLY-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1 @@ -81,9 +93,9 @@ define void @sub0(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], 
align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -110,9 +122,9 @@ entry: define void @sub1(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -140,9 +152,9 @@ entry: define void @sub2(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -179,11 +191,11 @@ define void @addsub0(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 
x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -212,17 +224,17 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[DST]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP6]], ptr [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3 +; CHECK-NEXT: store i32 [[TMP4]], ptr 
[[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3 ; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -252,15 +264,15 @@ define void @mul(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[DST]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP5]], -9 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 ; CHECK-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -286,21 +298,32 @@ entry: } define void @shl0(ptr noalias %dst, ptr noalias %src) { -; CHECK-LABEL: @shl0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr 
[[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP5]], 3 -; CHECK-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @shl0( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 +; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 +; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], +; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @shl0( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 +; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 +; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], +; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; 
POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4 +; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 +; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1 @@ -326,9 +349,9 @@ entry: define void @shl1(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @shl1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -356,9 +379,9 @@ entry: define void @add0f(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @add0f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -384,21 +407,32 @@ entry: } define void @add1f(ptr noalias %dst, ptr noalias %src) { -; CHECK-LABEL: @add1f( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], -; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP5]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @add1f( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 +; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 +; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 +; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], +; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @add1f( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 +; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 +; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], +; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 +; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 +; POW2-ONLY-NEXT: store 
float [[ADD9]], ptr [[INCDEC_PTR7]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1 @@ -433,9 +467,9 @@ define void @sub0f(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], -; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -462,9 +496,9 @@ entry: define void @sub1f(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub1f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -492,9 +526,9 @@ entry: define void @sub2f(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub2f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; 
CHECK-NEXT: ret void ; entry: @@ -531,11 +565,11 @@ define void @addsub0f(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -564,17 +598,17 @@ define void @addsub1f(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[DST]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], 
<2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP6]], ptr [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00 +; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00 ; CHECK-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -604,15 +638,15 @@ define void @mulf(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], -; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[DST]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], +; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, 
ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00 +; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 ; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -640,9 +674,9 @@ entry: define void @add0fn(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @add0fn( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -668,21 +702,32 @@ entry: } define void @add1fn(ptr noalias %dst, ptr noalias %src) { -; CHECK-LABEL: @add1fn( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], -; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP5]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4 -; CHECK-NEXT: ret void +; 
NON-POW2-LABEL: @add1fn( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 +; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 +; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 +; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], +; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @add1fn( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 +; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4 +; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 +; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], +; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 +; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00 +; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1 @@ -717,9 +762,9 @@ define void @sub0fn(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: 
[[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], -; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -746,9 +791,9 @@ entry: define void @sub1fn(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub1fn( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -776,9 +821,9 @@ entry: define void @sub2fn(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub2fn( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], -; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -808,15 +853,15 @@ define void @mulfn(ptr noalias %dst, ptr noalias %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; CHECK-NEXT: store <2 x float> [[TMP2]], ptr 
[[DST]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], +; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00 +; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 ; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; From 0bb957bf61f9f5ed2f6c5805d9dd3f8721272962 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 26 Jan 2024 21:57:03 +0000 Subject: [PATCH 02/23] Remove stale PADDING check lines, fix POW2/NON-POW2 prefixes in test. 
--- .../AArch64/vec3-reorder-reshuffle.ll | 308 ++++++------------ 1 file changed, 105 insertions(+), 203 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 60a353943eed1b..e405d755237a7f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s %struct.zot = type { i32, i32, i32 } @@ -173,94 +173,50 @@ entry: } define i32 @reorder_indices_1(float %0) { -; PADDING-LABEL: define i32 @reorder_indices_1( -; PADDING-SAME: float [[TMP0:%.*]]) { -; PADDING-NEXT: entry: -; PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 -; PADDING-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; PADDING-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> -; PADDING-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] -; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1 -; PADDING-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2 -; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x 
float> [[TMP3]], [[TMP6]] -; PADDING-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]]) -; PADDING-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer) -; PADDING-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer -; PADDING-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4 -; PADDING-NEXT: ret i32 0 -; -; NO-PADDING-LABEL: define i32 @reorder_indices_1( -; NO-PADDING-SAME: float [[TMP0:%.*]]) { -; NO-PADDING-NEXT: entry: -; NO-PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 -; NO-PADDING-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 -; NO-PADDING-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 -; NO-PADDING-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 -; NO-PADDING-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; NO-PADDING-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] -; NO-PADDING-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] -; NO-PADDING-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; NO-PADDING-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; NO-PADDING-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> -; NO-PADDING-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] -; NO-PADDING-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; NO-PADDING-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer -; NO-PADDING-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; NO-PADDING-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; NO-PADDING-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> 
zeroinitializer, <2 x float> [[TMP12]]) -; NO-PADDING-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; NO-PADDING-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; NO-PADDING-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; NO-PADDING-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; NO-PADDING-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 -; NO-PADDING-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 -; NO-PADDING-NEXT: ret i32 0 +; NON-POW2-LABEL: define i32 @reorder_indices_1( +; NON-POW2-SAME: float [[TMP0:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1 +; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2 +; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]] +; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]]) +; NON-POW2-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4 +; NON-POW2-NEXT: ret i32 0 ; ; POW2-ONLY-LABEL: define i32 @reorder_indices_1( ; POW2-ONLY-SAME: float [[TMP0:%.*]]) { ; POW2-ONLY-NEXT: entry: ; POW2-ONLY-NEXT: 
[[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] -; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1 -; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2 -; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]] -; POW2-ONLY-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]]) -; POW2-ONLY-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer) -; POW2-ONLY-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer -; POW2-ONLY-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] +; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] +; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) +; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] +; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP10:%.*]] = 
shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] +; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> +; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) +; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer +; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 +; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; POW2-ONLY-NEXT: ret i32 0 ; -; NON-POW2-LABEL: define i32 @reorder_indices_1( -; NON-POW2-SAME: float [[TMP0:%.*]]) { -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 -; NON-POW2-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 -; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; NON-POW2-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] -; NON-POW2-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] -; NON-POW2-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; NON-POW2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> -; NON-POW2-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] -; NON-POW2-NEXT: 
[[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer -; NON-POW2-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; NON-POW2-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; NON-POW2-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; NON-POW2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; NON-POW2-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; NON-POW2-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; NON-POW2-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; NON-POW2-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 -; NON-POW2-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 -; NON-POW2-NEXT: ret i32 0 -; entry: %nor1 = alloca [0 x [3 x float]], i32 0, align 4 %arrayidx.i = getelementptr float, ptr %nor1, i64 1 @@ -290,64 +246,35 @@ entry: } define void @reorder_indices_2(ptr %spoint) { -; PADDING-LABEL: define void @reorder_indices_2( -; PADDING-SAME: ptr [[SPOINT:%.*]]) { -; PADDING-NEXT: entry: -; PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1 -; PADDING-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2 -; PADDING-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 -; PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1 -; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2 -; PADDING-NEXT: 
[[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer -; PADDING-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4 -; PADDING-NEXT: ret void -; -; NO-PADDING-LABEL: define void @reorder_indices_2( -; NO-PADDING-SAME: ptr [[SPOINT:%.*]]) { -; NO-PADDING-NEXT: entry: -; NO-PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 -; NO-PADDING-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) -; NO-PADDING-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 -; NO-PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; NO-PADDING-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) -; NO-PADDING-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer -; NO-PADDING-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 -; NO-PADDING-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 -; NO-PADDING-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 -; NO-PADDING-NEXT: ret void +; NON-POW2-LABEL: define void @reorder_indices_2( +; NON-POW2-SAME: ptr [[SPOINT:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1 +; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2 +; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1 +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 
2 +; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reorder_indices_2( ; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) { ; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) +; POW2-ONLY-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 ; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1 -; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2 -; POW2-ONLY-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer -; POW2-ONLY-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer +; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr 
[[SPOINT]], i64 2 +; POW2-ONLY-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 ; POW2-ONLY-NEXT: ret void ; -; NON-POW2-LABEL: define void @reorder_indices_2( -; NON-POW2-SAME: ptr [[SPOINT:%.*]]) { -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) -; NON-POW2-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 -; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; NON-POW2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer -; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 -; NON-POW2-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 -; NON-POW2-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 -; NON-POW2-NEXT: ret void -; entry: %0 = extractelement <3 x float> zeroinitializer, i64 1 %1 = extractelement <3 x float> zeroinitializer, i64 2 @@ -398,56 +325,31 @@ entry: } define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) { -; PADDING-LABEL: define void @reuse_shuffle_indidces_1( -; PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { -; PADDING-NEXT: entry: -; PADDING-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 -; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 -; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2 -; PADDING-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer -; PADDING-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer -; PADDING-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 -; PADDING-NEXT: ret void -; -; NO-PADDING-LABEL: 
define void @reuse_shuffle_indidces_1( -; NO-PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { -; NO-PADDING-NEXT: entry: -; NO-PADDING-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; NO-PADDING-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 -; NO-PADDING-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer -; NO-PADDING-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer -; NO-PADDING-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 -; NO-PADDING-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 -; NO-PADDING-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 -; NO-PADDING-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 -; NO-PADDING-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 -; NO-PADDING-NEXT: ret void +; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1( +; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2 +; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer +; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 +; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1( ; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { ; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 -; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2 -; 
POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer -; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer -; POW2-ONLY-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer +; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer +; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 +; POW2-ONLY-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 +; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 ; POW2-ONLY-NEXT: ret void ; -; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1( -; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 -; NON-POW2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer -; NON-POW2-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer -; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 -; NON-POW2-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 -; NON-POW2-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 -; NON-POW2-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 -; NON-POW2-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 -; NON-POW2-NEXT: ret void -; entry: %mul24 = fmul float %1, 0.000000e+00 %2 = fadd float %mul24, 0.000000e+00 @@ -656,37 +558,37 @@ entry: } define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) { -; 
POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding( -; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { -; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 -; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1 -; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2 -; POW2-ONLY-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]] -; POW2-ONLY-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> , <3 x float> ) -; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], -; POW2-ONLY-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4 -; POW2-ONLY-NEXT: ret void -; ; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding( ; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2 ; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0 -; NON-POW2-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]] -; NON-POW2-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00) -; NON-POW2-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00 -; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> -; NON-POW2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> , <2 x float> ) -; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], -; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4 -; NON-POW2-NEXT: store 
float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1 +; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1 +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2 +; NON-POW2-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]] +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> , <3 x float> ) +; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], +; NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4 ; NON-POW2-NEXT: ret void ; +; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding( +; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0 +; POW2-ONLY-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]] +; POW2-ONLY-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00) +; POW2-ONLY-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]] +; POW2-ONLY-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> , <2 x float> ) +; POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], +; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4 +; POW2-ONLY-NEXT: store float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4 +; POW2-ONLY-NEXT: ret void +; entry: %arrayidx42.i = getelementptr float, ptr %A, i64 2 %arrayidx35.i = 
getelementptr float, ptr %A, i64 1 From 84cf9b90017d48879811c04662c8adb4f60be540 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 31 Jan 2024 16:21:44 +0000 Subject: [PATCH 03/23] !fixup Address latest comments, thanks! --- .../Transforms/Vectorize/SLPVectorizer.cpp | 86 +++++++------------ 1 file changed, 32 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4ac010e81e9476..c15237b733f1fc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2737,7 +2737,7 @@ class BoUpSLP { SmallVectorImpl *OpScalars = nullptr, SmallVectorImpl *AltScalars = nullptr) const; - /// Return the number of padding lanes (containg poison) for this node. + /// Return true if this is a non-power-of-2 node. bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); } #ifndef NDEBUG @@ -2900,10 +2900,8 @@ class BoUpSLP { if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); - if (!isPowerOf2_32(Last->Scalars.size())) { - assert((Last->ReorderIndices.empty()) && - "Reodering isn't implemented for nodes with padding yet"); - } + assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && + "Reordering isn't implemented for non-power-of-2 nodes yet"); } return Last; } @@ -3915,8 +3913,12 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, Order.clear(); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); - if (!Order.empty() && !isPowerOf2_32(VL.size())) + // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. 
+ if (!Order.empty() && !isPowerOf2_32(VL.size())) { + assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " + "supported with VectorizeNonPowerOf2"); return LoadsState::Gather; + } if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); @@ -4109,6 +4111,10 @@ static bool areTwoInsertFromSameBuildVector( std::optional BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { + // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. + if (TE.isNonPowOf2Vec()) + return std::nullopt; + // No need to reorder if need to shuffle reuses, still need to shuffle the // node. if (!TE.ReuseShuffleIndices.empty()) { @@ -4607,7 +4613,7 @@ bool BoUpSLP::canReorderOperands( TreeEntry *UserTE, SmallVectorImpl> &Edges, ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps) { - // Reordering isn't implemented for nodes with padding yet. + // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. if (UserTE->isNonPowOf2Vec()) return false; @@ -4789,7 +4795,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders]( const TreeEntry *TE) { - // Reordering for nodes with padding not implemented yet. + // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. if (TE->isNonPowOf2Vec()) return false; if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || @@ -5304,7 +5310,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::ExtractValue: case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); - if (Reuse || !CurrentOrder.empty()) + // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. 
+ if (isPowerOf2_32(VL.size()) && (Reuse || !CurrentOrder.empty())) return TreeEntry::Vectorize; LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); return TreeEntry::NeedToGather; @@ -6409,10 +6416,6 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, SmallVectorImpl &CurrentOrder, bool ResizeAllowed) const { - // TODO: Reusing extracts is not supported yet for non-power-of-2 ops. - if (!isPowerOf2_32(VL.size())) - return false; - const auto *It = find_if(VL, [](Value *V) { return isa(V); }); @@ -7029,12 +7032,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { ? TTI::TCC_Free : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)); }; - - // TODO: Only full gather is supported for non-power-of-2 operations for + // FIXME: Only full gather is supported for non-power-of-2 operations for // now. if (!isPowerOf2_32(VL.size())) return ComputeGatherCost(); - // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. InstructionsState S = getSameOpcode(VL, *R.TLI); @@ -7172,10 +7173,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { /*SubTp=*/nullptr, /*Args=*/*It) : TTI::TCC_Free); } - return GatherCost + - (all_of(Gathers, UndefValue::classof) - ? TTI::TCC_Free - : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); + return GatherCost + ComputeGatherCost(); }; /// Compute the cost of creating a vector containing the extracted values from @@ -9789,7 +9787,7 @@ BoUpSLP::isGatherShuffledEntry( // No need to check for the topmost gather node. if (TE == VectorizableTree.front().get()) return {}; - // Gathering for nodes with padding is not implemented yet. + // FIXME: Gathering for non-power-of-2 nodes not implemented yet. 
if (TE->isNonPowOf2Vec()) return {}; Mask.assign(VL.size(), PoisonMaskElem); @@ -10583,6 +10581,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, SmallVector Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); reorderScalars(VL, Mask); } + const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { @@ -10624,7 +10623,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, ShuffleBuilder.add(V, Mask); return ShuffleBuilder.finalize(std::nullopt); }; - const unsigned VF = VL.size(); Value *V = vectorizeTree(VE, PostponedPHIs); if (VF != cast(V->getType())->getNumElements()) { if (!VE->ReuseShuffleIndices.empty()) { @@ -10704,17 +10702,15 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, template ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); - unsigned VF = E->getVectorFactor(); BVTy ShuffleBuilder(Params...); if (E->isNonPowOf2Vec()) { Value *BV = ShuffleBuilder.gather(E->Scalars); SmallVector Mask(VF, PoisonMaskElem); - std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0); + std::iota(Mask.begin(), Mask.end(), 0); ShuffleBuilder.add(BV, Mask); return ShuffleBuilder.finalize(E->ReuseShuffleIndices); } - bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); @@ -13641,40 +13637,22 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, << "MinVF (" << MinVF << ")\n"); } - unsigned StartIdx = 0; + SmallVector CandidateVFs; if (VectorizeNonPowerOf2) { - // Try vectorizing with a non-power-of-2 VF. At the moment, only + // First try vectorizing with a non-power-of-2 VF. At the moment, only // consider cases where VF + 1 is a power-of-2, i.e. almost all vector // lanes are used. 
- unsigned CandVF = Operands.size() + 1; - if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) { - assert( - all_of( - Operands, - [&](Value *V) { - return cast(V)->getValueOperand()->getType() == - cast(Operands.front()) - ->getValueOperand() - ->getType(); - }) && - "Expected all operands of same type."); - if (!VectorizedStores.count(Operands.front()) && - !VectorizedStores.count(Operands.back()) && - TriedSequences - .insert(std::make_pair(Operands.front(), Operands.back())) - .second && - vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - StartIdx += Operands.size(); - } - } + unsigned CandVF = Operands.size(); + if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) + CandidateVFs.push_back(CandVF); } - - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + CandidateVFs.push_back(Size); + } + unsigned StartIdx = 0; + for (unsigned Size : CandidateVFs) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); assert( From 552b8aaf1563c9c074965dd24548d8cd446a2b2e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 1 Feb 2024 16:15:39 +0000 Subject: [PATCH 04/23] !fixup Add fixme to processBuildVector also use {} instead of (empty) ReuseShuffleIndices.
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c15237b733f1fc..285450ef03bc37 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10704,12 +10704,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); BVTy ShuffleBuilder(Params...); + // FIXME: Only full gathering is implemented for non-power-of-2 nodes at the + // moment. if (E->isNonPowOf2Vec()) { Value *BV = ShuffleBuilder.gather(E->Scalars); SmallVector Mask(VF, PoisonMaskElem); std::iota(Mask.begin(), Mask.end(), 0); ShuffleBuilder.add(BV, Mask); - return ShuffleBuilder.finalize(E->ReuseShuffleIndices); + return ShuffleBuilder.finalize({}); } bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), From 4bb53dd51533e79e6a63305ee530445ef247e9f6 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 2 Feb 2024 14:47:52 +0000 Subject: [PATCH 05/23] !fixup undo gather cost changes. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c0a5b3d326eeb9..aa816d1c21ab47 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7036,15 +7036,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); InstructionCost GatherCost = 0; SmallVector Gathers(VL.begin(), VL.end()); - auto ComputeGatherCost = [&]() { - return all_of(Gathers, UndefValue::classof) - ?
TTI::TCC_Free - : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)); - }; - // FIXME: Only full gather is supported for non-power-of-2 operations for - // now. - if (!isPowerOf2_32(VL.size())) - return ComputeGatherCost(); // Improve gather cost for gather of loads, if we can group some of the // loads into vector loads. InstructionsState S = getSameOpcode(VL, *R.TLI); @@ -7180,7 +7171,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { /*SubTp=*/nullptr, /*Args=*/*It) : TTI::TCC_Free); } - return GatherCost + ComputeGatherCost(); + return GatherCost + + (all_of(Gathers, UndefValue::classof) + ? TTI::TCC_Free + : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; /// Compute the cost of creating a vector containing the extracted values from @@ -10718,7 +10712,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { SmallVector Mask(VF, PoisonMaskElem); std::iota(Mask.begin(), Mask.end(), 0); ShuffleBuilder.add(BV, Mask); - return ShuffleBuilder.finalize({}); + return ShuffleBuilder.finalize(std::nullopt); } bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), From cabbe058b6bec5ca4099a8ab892bf2ec2f5d84a9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 13 Feb 2024 18:04:23 +0000 Subject: [PATCH 06/23] !fixup remove escape hatch for non-power-of-2 vectors from processBV. 
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 12 +- .../AArch64/vec3-reorder-reshuffle.ll | 72 +++--- .../Transforms/SLPVectorizer/X86/vec3-base.ll | 35 +-- .../SLPVectorizer/X86/vec3-calls.ll | 33 ++- .../X86/vec3-gather-some-loads.ll | 74 +++--- .../X86/vec3-reorder-reshuffle.ll | 211 +++++++++++------- 6 files changed, 261 insertions(+), 176 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 036b073932091f..de5a23e342f2f2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7185,7 +7185,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { for (unsigned I = 0, End = VL.size(); I < End; I += VF) { if (VectorizedLoads.contains(VL[I])) continue; - GatherCost += getBuildVectorCost(VL.slice(I, VF), Root); + GatherCost += getBuildVectorCost( + VL.slice(I, std::min(VL.size() - I, size_t(VF))), Root); } // Exclude potentially vectorized loads from list of gathered // scalars. @@ -10745,15 +10746,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); BVTy ShuffleBuilder(Params...); - // FIXME: Only full gathering is implemented for non-power-of-2 nodes at the - // moment. 
- if (E->isNonPowOf2Vec()) { - Value *BV = ShuffleBuilder.gather(E->Scalars); - SmallVector Mask(VF, PoisonMaskElem); - std::iota(Mask.begin(), Mask.end(), 0); - ShuffleBuilder.add(BV, Mask); - return ShuffleBuilder.finalize(std::nullopt); - } bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index e405d755237a7f..dd967030bc5b4e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -181,13 +181,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> ; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1 -; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2 -; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]] -; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]]) -; NON-POW2-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer) -; NON-POW2-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer -; NON-POW2-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4 +; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> 
zeroinitializer, <3 x float> [[TMP6]]) +; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 ; NON-POW2-NEXT: ret i32 0 ; ; POW2-ONLY-LABEL: define i32 @reorder_indices_1( @@ -249,16 +248,10 @@ define void @reorder_indices_2(ptr %spoint) { ; NON-POW2-LABEL: define void @reorder_indices_2( ; NON-POW2-SAME: ptr [[SPOINT:%.*]]) { ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1 -; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2 -; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 ; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1 -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2 -; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer -; NON-POW2-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @reorder_indices_2( @@ -330,7 +323,7 @@ define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[TMP2:%.*]] 
= insertelement <3 x float> poison, float [[TMP1]], i32 0 ; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 -; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> ; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 @@ -532,18 +525,24 @@ entry: } define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) { -; CHECK-LABEL: define void @vec3_extract( -; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2 -; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2 -; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2 -; CHECK-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1 -; CHECK-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1 -; CHECK-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2 -; CHECK-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0 -; CHECK-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @vec3_extract( +; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @vec3_extract( +; POW2-ONLY-SAME: <3 x i16> 
[[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2 +; POW2-ONLY-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2 +; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2 +; POW2-ONLY-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1 +; POW2-ONLY-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1 +; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2 +; POW2-ONLY-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0 +; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2 +; POW2-ONLY-NEXT: ret void ; entry: %pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2 @@ -561,16 +560,11 @@ define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) { ; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding( ; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) { ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1 -; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 -; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1 -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2 -; NON-POW2-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]] -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> , <3 x float> ) -; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], -; 
NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4 +; NON-POW2-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP1:%.*]] = fsub <3 x float> [[TMP0]], [[TMP0]] +; NON-POW2-NEXT: [[TMP2:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> , <3 x float> ) +; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <3 x float> [[TMP2]], +; NON-POW2-NEXT: store <3 x float> [[TMP3]], ptr [[A]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll index 6560fc6a145264..96d4b84e036918 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) { ; CHECK-LABEL: @v3_load_i32_mul_by_constant_store( @@ -161,18 +162,26 @@ entry: } define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) { -; CHECK-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 -; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2 -; CHECK-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 -; CHECK-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr 
[[GEP_SRC_0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], -; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4 -; CHECK-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2 -; CHECK-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], +; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 +; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2 +; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 +; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], +; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4 +; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2 +; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index 71b9315839ecff..243087c6d8d95b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -1,16 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer 
-mtriple=x86_64-apple-macosx -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s define void @vec3_vectorize_call(ptr %Colour, float %0) { -; CHECK-LABEL: @vec3_vectorize_call( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer) -; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4 -; CHECK-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @vec3_vectorize_call( +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[COLOUR:%.*]], align 4 +; NON-POW2-NEXT: [[ARRAYIDX91_I:%.*]] = getelementptr float, ptr [[COLOUR]], i64 1 +; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX91_I]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 1 +; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COLOUR]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @vec3_vectorize_call( +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr 
[[COLOUR:%.*]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00) +; POW2-ONLY-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %1 = load float, ptr %Colour, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll index 1411f9416f69df..e8adda0bdc7034 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll @@ -1,35 +1,55 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=NON-POW2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=POW2-ONLY %s target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" define void @test_insert_loads(ptr %A, ptr noalias %B, float %0) #0 { -; CHECK-LABEL: define void @test_insert_loads( -; CHECK-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[MULADD_0:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 1.000000e+00, float 1.000000e+00) -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = 
shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> , <2 x float> ) -; CHECK-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28 -; CHECK-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4 -; CHECK-NEXT: [[A_12:%.*]] = getelementptr i8, ptr [[A]], i64 12 -; CHECK-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4 -; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4 -; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4 -; CHECK-NEXT: [[GEP_28:%.*]] = getelementptr i8, ptr [[B]], i64 28 -; CHECK-NEXT: [[GEP_20:%.*]] = getelementptr i8, ptr [[B]], i64 20 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> , float [[L_A_12]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[L_A_28]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> , float [[L_B_0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x float> [[TMP9]]) -; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[GEP_4]], align 4 -; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[GEP_20]], align 4 -; CHECK-NEXT: store float [[MULADD_0]], ptr [[GEP_28]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @test_insert_loads( +; NON-POW2-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28 +; NON-POW2-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4 +; NON-POW2-NEXT: [[A_12:%.*]] = getelementptr i8, ptr 
[[A]], i64 12 +; NON-POW2-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4 +; NON-POW2-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4 +; NON-POW2-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <7 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <7 x float> [[TMP1]], <7 x float> poison, <7 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <7 x float> , float [[L_A_12]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <7 x float> [[TMP3]], float [[L_A_28]], i32 1 +; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <7 x float> [[TMP4]], <7 x float> poison, <7 x i32> +; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <7 x float> , float [[L_B_0]], i32 0 +; NON-POW2-NEXT: [[TMP7:%.*]] = call <7 x float> @llvm.fmuladd.v7f32(<7 x float> [[TMP2]], <7 x float> [[TMP5]], <7 x float> [[TMP6]]) +; NON-POW2-NEXT: store <7 x float> [[TMP7]], ptr [[GEP_4]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @test_insert_loads( +; POW2-ONLY-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[MULADD_0:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 1.000000e+00, float 1.000000e+00) +; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> , <2 x float> ) +; POW2-ONLY-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28 +; POW2-ONLY-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4 +; POW2-ONLY-NEXT: [[A_12:%.*]] = getelementptr i8, ptr [[A]], i64 12 +; POW2-ONLY-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4 +; POW2-ONLY-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4 +; 
POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4 +; POW2-ONLY-NEXT: [[GEP_28:%.*]] = getelementptr i8, ptr [[B]], i64 28 +; POW2-ONLY-NEXT: [[GEP_20:%.*]] = getelementptr i8, ptr [[B]], i64 20 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <4 x float> , float [[L_A_12]], i32 0 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[L_A_28]], i32 1 +; POW2-ONLY-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <4 x float> , float [[L_B_0]], i32 0 +; POW2-ONLY-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x float> [[TMP9]]) +; POW2-ONLY-NEXT: store <4 x float> [[TMP10]], ptr [[GEP_4]], align 4 +; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[GEP_20]], align 4 +; POW2-ONLY-NEXT: store float [[MULADD_0]], ptr [[GEP_28]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %muladd.0 = tail call float @llvm.fmuladd.f32(float %0, float 1.000000e+00, float 1.000000e+00) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 9584a663b2d486..1fafe72fbfa485 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s +; RUN: opt -passes=slp-vectorizer 
-slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s %struct.zot = type { i32, i32, i32 } @@ -138,21 +139,35 @@ if.end668: ; preds = %if.then665, %entry } define void @gather_2(ptr %mat1, float %0, float %1) { -; CHECK-LABEL: define void @gather_2( -; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00 -; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 -; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4 -; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @gather_2( +; NON-POW2-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP1]], i32 1 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> , float [[TMP0]], i32 1 +; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x 
float> [[TMP5]], float [[TMP1]], i32 2 +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> [[TMP6]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 +; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @gather_2( +; POW2-ONLY-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> , <2 x i32> +; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) +; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00 +; POW2-ONLY-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 +; POW2-ONLY-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2 +; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer +; POW2-ONLY-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4 +; POW2-ONLY-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %2 = call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float 0.000000e+00) @@ -171,32 +186,48 @@ entry: } define i32 @reorder_indices_1(float %0) { -; CHECK-LABEL: define i32 @reorder_indices_1( -; CHECK-SAME: float [[TMP0:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[NOR1:%.*]] = alloca [0 
x [3 x float]], i32 0, align 4 -; CHECK-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] -; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 -; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 -; CHECK-NEXT: ret i32 0 +; NON-POW2-LABEL: define i32 @reorder_indices_1( +; NON-POW2-SAME: float [[TMP0:%.*]]) { +; NON-POW2-NEXT: entry: +; 
NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) +; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 +; NON-POW2-NEXT: ret i32 0 +; +; POW2-ONLY-LABEL: define i32 @reorder_indices_1( +; POW2-ONLY-SAME: float [[TMP0:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] +; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] +; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) +; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; POW2-ONLY-NEXT: 
[[TMP8:%.*]] = fneg <2 x float> [[TMP7]] +; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] +; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> +; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) +; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer +; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 +; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 +; POW2-ONLY-NEXT: ret i32 0 ; entry: %nor1 = alloca [0 x [3 x float]], i32 0, align 4 @@ -227,19 +258,28 @@ entry: } define void @reorder_indices_2(ptr %spoint) { -; CHECK-LABEL: define void @reorder_indices_2( -; CHECK-SAME: ptr [[SPOINT:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 -; CHECK-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer -; CHECK-NEXT: store <2 x 
float> [[TMP3]], ptr [[DSCO]], align 4 -; CHECK-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 -; CHECK-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @reorder_indices_2( +; NON-POW2-SAME: ptr [[SPOINT:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @reorder_indices_2( +; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) +; POW2-ONLY-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00 +; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer +; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2 +; POW2-ONLY-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %0 = extractelement <3 x float> zeroinitializer, i64 1 @@ -291,19 +331,30 @@ entry: } define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) { -; CHECK-LABEL: define void @reuse_shuffle_indidces_1( -; CHECK-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float 
[[TMP1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 -; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 -; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1( +; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer +; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1( +; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer +; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer +; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4 +; POW2-ONLY-NEXT: 
[[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2 +; POW2-ONLY-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00 +; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %mul24 = fmul float %1, 0.000000e+00 @@ -488,15 +539,21 @@ entry: } define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) { -; CHECK-LABEL: define void @vec3_extract( -; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2 -; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2 -; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2 -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], <3 x i16> poison, <2 x i32> -; CHECK-NEXT: store <2 x i16> [[TMP0]], ptr [[CALL3_I536]], align 2 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @vec3_extract( +; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @vec3_extract( +; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2 +; POW2-ONLY-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2 +; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2 +; POW2-ONLY-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], <3 x i16> poison, <2 x i32> +; POW2-ONLY-NEXT: store <2 x i16> 
[[TMP0]], ptr [[CALL3_I536]], align 2 +; POW2-ONLY-NEXT: ret void ; entry: %pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2 From f30c75389027a006936fd4434b3c270b4b50c1e3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 13 Feb 2024 18:28:52 +0000 Subject: [PATCH 07/23] !fixup removed with in wrong place --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index de5a23e342f2f2..59f92ca60eece2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -190,7 +190,7 @@ static cl::opt static cl::opt VectorizeNonPowerOf2( "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, - cl::desc("Try to vectorize with non-power-of-2 with number of elements.")); + cl::desc("Try to vectorize with non-power-of-2 number of elements.")); // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. 
From f15ddd902c11198c064e901322da8fea225d999e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 13 Feb 2024 20:50:28 +0000 Subject: [PATCH 08/23] !fixup also update odd_store.ll --- .../Transforms/SLPVectorizer/X86/odd_store.ll | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll index 853b4f396aaa50..5f2c42d5c2dec8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll @@ -13,13 +13,12 @@ define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) { ; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10 ; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4 ; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[T]], i32 1 -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[T]], i32 2 -; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP2]], [[TMP5]] -; NON-POW2-NEXT: [[TMP7:%.*]] = fpext <3 x float> [[TMP6]] to <3 x double> -; NON-POW2-NEXT: [[TMP8:%.*]] = fadd <3 x double> [[TMP7]], -; NON-POW2-NEXT: [[TMP9:%.*]] = fptosi <3 x double> [[TMP8]] to <3 x i8> -; NON-POW2-NEXT: store <3 x i8> [[TMP9]], ptr [[A:%.*]], align 1 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP2]], [[TMP4]] +; NON-POW2-NEXT: [[TMP6:%.*]] = fpext <3 x float> [[TMP5]] to <3 x double> +; NON-POW2-NEXT: [[TMP7:%.*]] = fadd <3 x double> [[TMP6]], +; NON-POW2-NEXT: [[TMP8:%.*]] = fptosi <3 x double> [[TMP7]] to <3 x i8> +; NON-POW2-NEXT: store <3 x i8> [[TMP8]], ptr [[A:%.*]], align 1 ; NON-POW2-NEXT: ret i32 undef ; ; POW2-ONLY-LABEL: @foo( @@ -105,13 +104,18 @@ 
define void @test_v4f32_v2f32_splat_store(<4 x float> %f, ptr %p){ } define void @test_v4f32_v3f32_store(<4 x float> %f, ptr %p){ -; CHECK-LABEL: @test_v4f32_v3f32_store( -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x float> [[F:%.*]], i64 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[P]], align 4 -; CHECK-NEXT: store float [[X2]], ptr [[P2]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: @test_v4f32_v3f32_store( +; NON-POW2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <3 x i32> +; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[P:%.*]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: @test_v4f32_v3f32_store( +; POW2-ONLY-NEXT: [[X2:%.*]] = extractelement <4 x float> [[F:%.*]], i64 2 +; POW2-ONLY-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 2 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> +; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[P]], align 4 +; POW2-ONLY-NEXT: store float [[X2]], ptr [[P2]], align 4 +; POW2-ONLY-NEXT: ret void ; %x0 = extractelement <4 x float> %f, i64 0 %x1 = extractelement <4 x float> %f, i64 1 From 5cd569b1dd922aa7abaeeaeba55e41d85e65443b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 22 Feb 2024 19:33:02 +0000 Subject: [PATCH 09/23] !fixup address latest comments, thanks! 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e2749de353d4a5..3e8172934a747e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7186,7 +7186,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (VectorizedLoads.contains(VL[I])) continue; GatherCost += getBuildVectorCost( - VL.slice(I, std::min(VL.size() - I, size_t(VF))), Root); + VL.slice(I, std::min(VL.size() - I, VF)), Root); } // Exclude potentially vectorized loads from list of gathered // scalars. @@ -10790,6 +10790,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } return true; }; + BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector Mask; SmallVector ExtractMask(GatheredScalars.size(), PoisonMaskElem); From e189eec90a234d77cf7dc3fd5a6be65f8e84ad54 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 23 Feb 2024 16:52:57 +0000 Subject: [PATCH 10/23] [SLP] Collect candidate VFs in vector in vectorizeStores (NFC). This is in preparation for https://github.com/llvm/llvm-project/pull/77790 and makes it easy to add other, non-power-of-2 VFs for processing. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index de4e56ff80659a..8ee840e97e94b7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13918,10 +13918,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, << "MinVF (" << MinVF << ")\n"); } - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? 
- unsigned StartIdx = 0; + SmallVector CandidateVFs; for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + CandidateVFs.push_back(Size); + } + unsigned StartIdx = 0; + for (unsigned Size : CandidateVFs) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); assert( From b6dac7bc363b1df18304f39b4bb896421bcbf3e5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 23 Feb 2024 16:59:13 +0000 Subject: [PATCH 11/23] !fixup update tests after merge. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 - .../AArch64/vec3-reorder-reshuffle.ll | 17 ++++++++--------- .../SLPVectorizer/X86/vec3-reorder-reshuffle.ll | 17 ++++++++--------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9aa3fbc631729c..d8ebced31bdac9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11019,7 +11019,6 @@ template ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); - BVTy ShuffleBuilder(Params...); bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 67990a50d26558..a125adde1d819c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -201,19 +201,18 @@ define i32 @reorder_indices_1(float %0) { ; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; POW2-ONLY-NEXT: 
[[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; POW2-ONLY-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer ; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]]) +; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 
[[TMP5]], float 0.000000e+00) +; POW2-ONLY-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer +; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00 +; POW2-ONLY-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4 ; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; POW2-ONLY-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 1fafe72fbfa485..c28c20e5e4609f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -213,19 +213,18 @@ define i32 @reorder_indices_1(float %0) { ; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; POW2-ONLY-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer ; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; POW2-ONLY-NEXT: [[TMP14:%.*]] = call 
<2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; POW2-ONLY-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]]) +; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; POW2-ONLY-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer +; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00 +; POW2-ONLY-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4 ; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; POW2-ONLY-NEXT: ret i32 0 ; From 8e7339aa06534156d18c9d179c457aeea16755cc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 28 Feb 2024 16:03:14 +0000 Subject: [PATCH 12/23] [SLP] Exit early . 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 743eaf9039b285..f7bfb0d506e3e0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13912,10 +13912,11 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, unsigned MinVF = TTI->getStoreMinimumVF( R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); - if (MaxVF <= MinVF) { + if (MaxVF < MinVF) { LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " << "MinVF (" << MinVF << ")\n"); + return; } SmallVector CandidateVFs; From 3eacfa64a35f0aed09180c00cbb5272c4a4c9ca0 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 28 Feb 2024 16:03:14 +0000 Subject: [PATCH 13/23] [SLP] Exit early if MaxVF < MinVF (NFCI). Exit early if MaxVF < MinVF. In that case, the loop body below will never get entered. Note that this adjusts the condition from MaxVF <= MinVF. If MaxVF == MinVF, vectorization may still be feasible (and the loop below gets entered). --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2b7d518c1c1a78..e381cd2c5794b1 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13912,10 +13912,11 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, unsigned MinVF = TTI->getStoreMinimumVF( R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy); - if (MaxVF <= MinVF) { + if (MaxVF < MinVF) { LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= " << "MinVF (" << MinVF << ")\n"); + return; } // FIXME: Is division-by-2 the correct step? 
Should we assert that the From 8b6b0e820792b1950abfebb5e3b8cb8122628d62 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 28 Feb 2024 16:13:04 +0000 Subject: [PATCH 14/23] !fixup use for_each. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f7bfb0d506e3e0..9dce67328d95b0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13919,12 +13919,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, return; } - SmallVector CandidateVFs; - for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? - CandidateVFs.push_back(Size); - } + unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); + SmallVector CandidateVFs(Sz); + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + unsigned Size = MaxVF; + for_each(CandidateVFs, [&](unsigned &VF) { + VF = Size; + Size /= 2; + }); unsigned StartIdx = 0; for (unsigned Size : CandidateVFs) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { From 4d8c47de417617ba794bd63909e55e790fc39d0c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 1 Mar 2024 20:11:42 +0000 Subject: [PATCH 15/23] !fixup add non-power-of-2 VF correctly. 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f1e56f0f4ee5bc..e8435d75bd7fc2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14074,7 +14074,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, continue; } - unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF) + 1; + unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); SmallVector CandidateVFs(Sz); auto VFsToFill = make_range(CandidateVFs.begin(), CandidateVFs.end()); if (VectorizeNonPowerOf2) { @@ -14084,6 +14084,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, unsigned CandVF = Operands.size(); if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) { CandidateVFs[0] = CandVF; + CandidateVFs.push_back(0); VFsToFill = make_range(CandidateVFs.begin() + 1, CandidateVFs.end()); } } From fb1c7be20d558dab9a4ca7d0897d38524de5dc14 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 6 Mar 2024 11:25:31 +0000 Subject: [PATCH 16/23] !fixup address latest comments, thanks! --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a5aa48bdfaf9df..15a211d7ad079c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7645,7 +7645,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { if (VectorizedLoads.contains(VL[I])) continue; GatherCost += getBuildVectorCost( - VL.slice(I, std::min(VL.size() - I, VF)), Root); + VL.slice(I, std::min(End - I, VF)), Root); } // Exclude potentially vectorized loads from list of gathered // scalars. 
@@ -11268,6 +11268,7 @@ template ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); + bool NeedFreeze = false; SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), E->ReuseShuffleIndices.end()); From 4c1197ab603de6bd1d8312c9e10c2a250cbe6904 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 6 Mar 2024 11:30:30 +0000 Subject: [PATCH 17/23] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 15a211d7ad079c..e9de536539f36a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7644,8 +7644,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { for (unsigned I = 0, End = VL.size(); I < End; I += VF) { if (VectorizedLoads.contains(VL[I])) continue; - GatherCost += getBuildVectorCost( - VL.slice(I, std::min(End - I, VF)), Root); + GatherCost += + getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); } // Exclude potentially vectorized loads from list of gathered // scalars. From 210210fa824003070187baadf73dbc4dcc121d36 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 8 Mar 2024 16:09:33 +0000 Subject: [PATCH 18/23] !fixup add separate early exit for Non-power-of-2 VFs. 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e9de536539f36a..a9d9d7abd78a8e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5776,7 +5776,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (isPowerOf2_32(VL.size()) && (Reuse || !CurrentOrder.empty())) + if (!isPowerOf2_32(VL.size())) + return TreeEntry::NeedToGather; + if ((Reuse || !CurrentOrder.empty())) return TreeEntry::Vectorize; LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); return TreeEntry::NeedToGather; From 981a3d4b25ce234346f38bfc14c9f20ee799894d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 19 Mar 2024 17:32:47 +0000 Subject: [PATCH 19/23] !fixup adjust VF computation as suggested --- .../Transforms/Vectorize/SLPVectorizer.cpp | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7bfc970ed91818..ac6458debf124e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14271,26 +14271,23 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, continue; } - unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); - SmallVector CandidateVFs(Sz); - auto VFsToFill = make_range(CandidateVFs.begin(), CandidateVFs.end()); + std::optional NonPowerOf2VF; if (VectorizeNonPowerOf2) { // First try vectorizing with a non-power-of-2 VF. At the moment, only // consider cases where VF + 1 is a power-of-2, i.e. almost all vector // lanes are used. 
unsigned CandVF = Operands.size(); if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) { - CandidateVFs[0] = CandVF; - CandidateVFs.push_back(0); - VFsToFill = make_range(CandidateVFs.begin() + 1, CandidateVFs.end()); + NonPowerOf2VF = CandVF; } } - // FIXME: Is division-by-2 the correct step? Should we assert that the - // register size is a power-of-2? - unsigned Size = MaxVF; - for_each(VFsToFill, [&](unsigned &VF) { - VF = Size; - Size /= 2; + + unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); + SmallVector CandidateVFs(Sz + bool(NonPowerOf2VF)); + unsigned Size = MinVF; + for_each(reverse(CandidateVFs), [&](unsigned &VF) { + VF = Size > MaxVF ? *NonPowerOf2VF : Size; + Size *= 2; }); unsigned StartIdx = 0; for (unsigned Size : CandidateVFs) { From c52b68ce6e35a16cb5e9f31afbfc579ce2d94532 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 4 Apr 2024 16:27:14 +0100 Subject: [PATCH 20/23] !fixup address comments, update after upstream changes. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 19 +++---- .../AArch64/vec3-reorder-reshuffle.ll | 43 ++++++++++------ .../X86/vec3-reorder-reshuffle.ll | 51 +++++++++++-------- 3 files changed, 66 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f1cbd1f647219f..f06d2e18c82363 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6141,6 +6141,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { + if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " + "for nodes with padding.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return false; + } + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || (UniquePositions.size() == 
1 && all_of(UniqueValues, @@ -6160,12 +6167,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (PWSz == VL.size()) { ReuseShuffleIndicies.clear(); } else { - if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { - LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " - "for nodes with padding.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return false; - } NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); NonUniqueValueVL.append(PWSz - UniqueValues.size(), UniqueValues.back()); @@ -6177,12 +6178,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; } - if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { - LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for " - "nodes with padding.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return false; - } VL = UniqueValues; } return true; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index d9f51075aab249..47d918eabdfe2b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -140,21 +140,34 @@ if.end668: ; preds = %if.then665, %entry } define void @gather_2(ptr %mat1, float %0, float %1) { -; CHECK-LABEL: define void @gather_2( -; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x 
float> [[TMP6]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00 -; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 -; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4 -; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @gather_2( +; NON-POW2-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> , float [[TMP1]], i32 1 +; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP3]], <3 x float> [[TMP5]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 +; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer +; NON-POW2-NEXT: store <3 x float> [[TMP7]], ptr [[ARRAYIDX163]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @gather_2( +; POW2-ONLY-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: 
[[TMP4:%.*]] = insertelement <2 x float> , float [[TMP1]], i32 1 +; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) +; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00 +; POW2-ONLY-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 +; POW2-ONLY-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2 +; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer +; POW2-ONLY-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4 +; POW2-ONLY-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %2 = call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float 0.000000e+00) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index c28c20e5e4609f..1399b4c35c781d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -143,10 +143,9 @@ define void @gather_2(ptr %mat1, float %0, float %1) { ; NON-POW2-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP1]], i32 1 -; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> , float [[TMP0]], i32 1 -; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP1]], i32 2 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> 
poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> , float [[TMP1]], i32 1 +; NON-POW2-NEXT: [[TMP6:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <3 x i32> ; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> [[TMP6]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 ; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], zeroinitializer @@ -157,8 +156,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) { ; POW2-ONLY-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { ; POW2-ONLY-NEXT: entry: ; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> , <2 x i32> +; POW2-ONLY-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP1]], i32 1 ; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) ; POW2-ONLY-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) ; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00 @@ -401,20 +400,32 @@ entry: } define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) { -; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_2( -; CHECK-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> 
zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) -; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[BEZT]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2 -; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX8_I831]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @reuse_shuffle_indices_cost_crash_2( +; NON-POW2-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[FNEG]], i32 0 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> , float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP2]], <3 x float> [[TMP4]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[BEZT]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @reuse_shuffle_indices_cost_crash_2( +; POW2-ONLY-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +; 
POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[BEZT]], align 4 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00) +; POW2-ONLY-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2 +; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX8_I831]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %fneg = fmul float %0, 0.000000e+00 From 8d1b5d43d4e3cf18d368ce135e269105bce77d88 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 4 Apr 2024 16:40:21 +0100 Subject: [PATCH 21/23] !fixup remove newline --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f06d2e18c82363..366b709c46c895 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6147,7 +6147,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return false; } - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || (UniquePositions.size() == 1 && all_of(UniqueValues, From b7ccdd4aeaef9bd6d82904127444b3ad525f61fe Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 5 Apr 2024 19:48:43 +0100 Subject: [PATCH 22/23] !fixup address latest comments, thanks! 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fbd5046e923934..cef64ada7b1a1d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5835,7 +5835,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. if (!isPowerOf2_32(VL.size())) return TreeEntry::NeedToGather; - if ((Reuse || !CurrentOrder.empty())) + if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); return TreeEntry::NeedToGather; @@ -6141,6 +6141,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { + // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops. if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); @@ -14928,22 +14929,21 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, continue; } - std::optional NonPowerOf2VF; + unsigned NonPowerOf2VF = 0; if (VectorizeNonPowerOf2) { // First try vectorizing with a non-power-of-2 VF. At the moment, only // consider cases where VF + 1 is a power-of-2, i.e. almost all vector // lanes are used. unsigned CandVF = Operands.size(); - if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) { + if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) NonPowerOf2VF = CandVF; - } } unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); - SmallVector CandidateVFs(Sz + bool(NonPowerOf2VF)); + SmallVector CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0)); unsigned Size = MinVF; for_each(reverse(CandidateVFs), [&](unsigned &VF) { - VF = Size > MaxVF ?
*NonPowerOf2VF : Size; + VF = Size > MaxVF ? NonPowerOf2VF : Size; Size *= 2; }); unsigned StartIdx = 0; From 3919ee6627bed5205c24a6c2f170d8046359e15c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 9 Apr 2024 11:18:05 +0100 Subject: [PATCH 23/23] !fixup add assert --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e99918c27061fd..1658ed15eb562b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2811,7 +2811,12 @@ class BoUpSLP { SmallVectorImpl *AltScalars = nullptr) const; /// Return true if this is a non-power-of-2 node. - bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); } + bool isNonPowOf2Vec() const { + bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size()); + assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && + "Reshuffling not supported with non-power-of-2 vectors yet."); + return IsNonPowerOf2; + } #ifndef NDEBUG /// Debug printer.