@@ -2864,6 +2864,12 @@ class BoUpSLP {
28642864 /// avoid issues with def-use order.
28652865 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
28662866
2867+ TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
2868+ const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
2869+ unsigned NodeIdx) const {
2870+ return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
2871+ }
2872+
28672873 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
28682874 /// \p E.
28692875 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
@@ -6964,6 +6970,55 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
69646970 }
69656971 }
69666972
6973+ // Check if this is a duplicate of another entry.
6974+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6975+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6976+ if (!E->isSame(VL)) {
6977+ auto It = MultiNodeScalars.find(S.OpValue);
6978+ if (It != MultiNodeScalars.end()) {
6979+ auto *TEIt = find_if(It->getSecond(),
6980+ [&](TreeEntry *ME) { return ME->isSame(VL); });
6981+ if (TEIt != It->getSecond().end())
6982+ E = *TEIt;
6983+ else
6984+ E = nullptr;
6985+ } else {
6986+ E = nullptr;
6987+ }
6988+ }
6989+ if (!E) {
6990+ if (!doesNotNeedToBeScheduled(S.OpValue)) {
6991+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6992+ if (TryToFindDuplicates(S))
6993+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6994+ ReuseShuffleIndices);
6995+ return;
6996+ }
6997+ SmallPtrSet<const TreeEntry *, 4> Nodes;
6998+ Nodes.insert(getTreeEntry(S.OpValue));
6999+ for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
7000+ Nodes.insert(E);
7001+ SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
7002+ if (any_of(Nodes, [&](const TreeEntry *E) {
7003+ return all_of(E->Scalars,
7004+ [&](Value *V) { return Values.contains(V); });
7005+ })) {
7006+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
7007+ if (TryToFindDuplicates(S))
7008+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7009+ ReuseShuffleIndices);
7010+ return;
7011+ }
7012+ } else {
7013+ // Record the reuse of the tree node. FIXME, currently this is only used
7014+ // to properly draw the graph rather than for the actual vectorization.
7015+ E->UserTreeIndices.push_back(UserTreeIdx);
7016+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
7017+ << ".\n");
7018+ return;
7019+ }
7020+ }
7021+
69677022 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
69687023 // a load), in which case peek through to include it in the tree, without
69697024 // ballooning over-budget.
@@ -7095,55 +7150,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
70957150 // We now know that this is a vector of instructions of the same type from
70967151 // the same block.
70977152
7098- // Check if this is a duplicate of another entry.
7099- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
7100- LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
7101- if (!E->isSame(VL)) {
7102- auto It = MultiNodeScalars.find(S.OpValue);
7103- if (It != MultiNodeScalars.end()) {
7104- auto *TEIt = find_if(It->getSecond(),
7105- [&](TreeEntry *ME) { return ME->isSame(VL); });
7106- if (TEIt != It->getSecond().end())
7107- E = *TEIt;
7108- else
7109- E = nullptr;
7110- } else {
7111- E = nullptr;
7112- }
7113- }
7114- if (!E) {
7115- if (!doesNotNeedToBeScheduled(S.OpValue)) {
7116- LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
7117- if (TryToFindDuplicates(S))
7118- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7119- ReuseShuffleIndices);
7120- return;
7121- }
7122- SmallPtrSet<const TreeEntry *, 4> Nodes;
7123- Nodes.insert(getTreeEntry(S.OpValue));
7124- for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
7125- Nodes.insert(E);
7126- SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
7127- if (any_of(Nodes, [&](const TreeEntry *E) {
7128- return all_of(E->Scalars,
7129- [&](Value *V) { return Values.contains(V); });
7130- })) {
7131- LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
7132- if (TryToFindDuplicates(S))
7133- newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7134- ReuseShuffleIndices);
7135- return;
7136- }
7137- } else {
7138- // Record the reuse of the tree node. FIXME, currently this is only used
7139- // to properly draw the graph rather than for the actual vectorization.
7140- E->UserTreeIndices.push_back(UserTreeIdx);
7141- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
7142- << ".\n");
7143- return;
7144- }
7145- }
7146-
71477153 // Check that none of the instructions in the bundle are already in the tree.
71487154 for (Value *V : VL) {
71497155 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
@@ -9362,22 +9368,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
93629368
93639369const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
93649370 unsigned Idx) const {
9365- Value *Op = E->getOperand(Idx).front();
9366- if (const TreeEntry *TE = getTreeEntry(Op)) {
9367- if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9368- return EI.EdgeIdx == Idx && EI.UserTE == E;
9369- }) != TE->UserTreeIndices.end())
9370- return TE;
9371- auto MIt = MultiNodeScalars.find(Op);
9372- if (MIt != MultiNodeScalars.end()) {
9373- for (const TreeEntry *TE : MIt->second) {
9374- if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9375- return EI.EdgeIdx == Idx && EI.UserTE == E;
9376- }) != TE->UserTreeIndices.end())
9377- return TE;
9378- }
9379- }
9380- }
9371+ if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
9372+ return VE;
93819373 const auto *It =
93829374 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
93839375 return TE->isGather() &&
@@ -12521,120 +12513,123 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1252112513 }
1252212514};
1252312515
12524- Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12525- bool PostponedPHIs) {
12526- ValueList &VL = E->getOperand(NodeIdx);
12527- const unsigned VF = VL.size();
12516+ BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
12517+ unsigned NodeIdx) {
12518+ ArrayRef<Value *> VL = E->getOperand(NodeIdx);
1252812519 InstructionsState S = getSameOpcode(VL, *TLI);
1252912520 // Special processing for GEPs bundle, which may include non-gep values.
1253012521 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
1253112522 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
1253212523 if (It != VL.end())
1253312524 S = getSameOpcode(*It, *TLI);
1253412525 }
12535- if (S.getOpcode()) {
12536- auto CheckSameVE = [&](const TreeEntry *VE) {
12537- return VE->isSame(VL) &&
12538- (any_of(VE->UserTreeIndices,
12539- [E, NodeIdx](const EdgeInfo &EI) {
12540- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12541- }) ||
12542- any_of(VectorizableTree,
12543- [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12544- return TE->isOperandGatherNode({E, NodeIdx}) &&
12545- VE->isSame(TE->Scalars);
12546- }));
12526+ if (!S.getOpcode())
12527+ return nullptr;
12528+ auto CheckSameVE = [&](const TreeEntry *VE) {
12529+ return VE->isSame(VL) &&
12530+ (any_of(VE->UserTreeIndices,
12531+ [E, NodeIdx](const EdgeInfo &EI) {
12532+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12533+ }) ||
12534+ any_of(VectorizableTree,
12535+ [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12536+ return TE->isOperandGatherNode(
12537+ {const_cast<TreeEntry *>(E), NodeIdx}) &&
12538+ VE->isSame(TE->Scalars);
12539+ }));
12540+ };
12541+ TreeEntry *VE = getTreeEntry(S.OpValue);
12542+ if (VE && CheckSameVE(VE))
12543+ return VE;
12544+ auto It = MultiNodeScalars.find(S.OpValue);
12545+ if (It != MultiNodeScalars.end()) {
12546+ auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12547+ return TE != VE && CheckSameVE(TE);
12548+ });
12549+ if (I != It->getSecond().end())
12550+ return *I;
12551+ }
12552+ return nullptr;
12553+ }
12554+
12555+ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12556+ bool PostponedPHIs) {
12557+ ValueList &VL = E->getOperand(NodeIdx);
12558+ const unsigned VF = VL.size();
12559+ if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
12560+ auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12561+ // V may be affected by MinBWs.
12562+ // We want ShuffleInstructionBuilder to correctly support REVEC. The key
12563+ // factor is the number of elements, not their type.
12564+ Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
12565+ unsigned NumElements = getNumElements(VL.front()->getType());
12566+ ShuffleInstructionBuilder ShuffleBuilder(
12567+ NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
12568+ : ScalarTy,
12569+ Builder, *this);
12570+ ShuffleBuilder.add(V, Mask);
12571+ SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
12572+ E->CombinedEntriesWithIndices.size());
12573+ transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
12574+ [&](const auto &P) {
12575+ return std::make_pair(VectorizableTree[P.first].get(),
12576+ P.second);
12577+ });
12578+ return ShuffleBuilder.finalize(std::nullopt, SubVectors);
1254712579 };
12548- TreeEntry *VE = getTreeEntry(S.OpValue);
12549- bool IsSameVE = VE && CheckSameVE(VE);
12550- if (!IsSameVE) {
12551- auto It = MultiNodeScalars.find(S.OpValue);
12552- if (It != MultiNodeScalars.end()) {
12553- auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12554- return TE != VE && CheckSameVE(TE);
12555- });
12556- if (I != It->getSecond().end()) {
12557- VE = *I;
12558- IsSameVE = true;
12559- }
12560- }
12561- }
12562- if (IsSameVE) {
12563- auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12564- // V may be affected by MinBWs.
12565- // We want ShuffleInstructionBuilder to correctly support REVEC. The key
12566- // factor is the number of elements, not their type.
12567- Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
12568- unsigned NumElements = getNumElements(VL.front()->getType());
12569- ShuffleInstructionBuilder ShuffleBuilder(
12570- NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
12571- : ScalarTy,
12572- Builder, *this);
12573- ShuffleBuilder.add(V, Mask);
12574- SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
12575- E->CombinedEntriesWithIndices.size());
12576- transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
12577- [&](const auto &P) {
12578- return std::make_pair(VectorizableTree[P.first].get(),
12579- P.second);
12580- });
12581- return ShuffleBuilder.finalize(std::nullopt, SubVectors);
12582- };
12583- Value *V = vectorizeTree(VE, PostponedPHIs);
12584- if (VF * getNumElements(VL[0]->getType()) !=
12585- cast<FixedVectorType>(V->getType())->getNumElements()) {
12586- if (!VE->ReuseShuffleIndices.empty()) {
12587- // Reshuffle to get only unique values.
12588- // If some of the scalars are duplicated in the vectorization
12589- // tree entry, we do not vectorize them but instead generate a
12590- // mask for the reuses. But if there are several users of the
12591- // same entry, they may have different vectorization factors.
12592- // This is especially important for PHI nodes. In this case, we
12593- // need to adapt the resulting instruction for the user
12594- // vectorization factor and have to reshuffle it again to take
12595- // only unique elements of the vector. Without this code the
12596- // function incorrectly returns reduced vector instruction with
12597- // the same elements, not with the unique ones.
12598-
12599- // block:
12600- // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12601- // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12602- // ... (use %2)
12603- // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12604- // br %block
12605- SmallVector<int> Mask(VF, PoisonMaskElem);
12606- for (auto [I, V] : enumerate(VL)) {
12607- if (isa<PoisonValue>(V))
12608- continue;
12609- Mask[I] = VE->findLaneForValue(V);
12610- }
12611- V = FinalShuffle(V, Mask);
12612- } else {
12613- assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12614- "Expected vectorization factor less "
12615- "than original vector size.");
12616- SmallVector<int> UniformMask(VF, 0);
12617- std::iota(UniformMask.begin(), UniformMask.end(), 0);
12618- V = FinalShuffle(V, UniformMask);
12580+ Value *V = vectorizeTree(VE, PostponedPHIs);
12581+ if (VF * getNumElements(VL[0]->getType()) !=
12582+ cast<FixedVectorType>(V->getType())->getNumElements()) {
12583+ if (!VE->ReuseShuffleIndices.empty()) {
12584+ // Reshuffle to get only unique values.
12585+ // If some of the scalars are duplicated in the vectorization
12586+ // tree entry, we do not vectorize them but instead generate a
12587+ // mask for the reuses. But if there are several users of the
12588+ // same entry, they may have different vectorization factors.
12589+ // This is especially important for PHI nodes. In this case, we
12590+ // need to adapt the resulting instruction for the user
12591+ // vectorization factor and have to reshuffle it again to take
12592+ // only unique elements of the vector. Without this code the
12593+ // function incorrectly returns reduced vector instruction with
12594+ // the same elements, not with the unique ones.
12595+
12596+ // block:
12597+ // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12598+ // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12599+ // ... (use %2)
12600+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12601+ // br %block
12602+ SmallVector<int> Mask(VF, PoisonMaskElem);
12603+ for (auto [I, V] : enumerate(VL)) {
12604+ if (isa<PoisonValue>(V))
12605+ continue;
12606+ Mask[I] = VE->findLaneForValue(V);
1261912607 }
12620- }
12621- // Need to update the operand gather node, if actually the operand is not a
12622- // vectorized node, but the buildvector/gather node, which matches one of
12623- // the vectorized nodes.
12624- if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12625- return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12626- }) == VE->UserTreeIndices.end()) {
12627- auto *It = find_if(
12628- VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12629- return TE->isGather() &&
12630- TE->UserTreeIndices.front().UserTE == E &&
12631- TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12632- });
12633- assert(It != VectorizableTree.end() && "Expected gather node operand.");
12634- (*It)->VectorizedValue = V;
12635- }
12636- return V;
12608+ V = FinalShuffle(V, Mask);
12609+ } else {
12610+ assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12611+ "Expected vectorization factor less "
12612+ "than original vector size.");
12613+ SmallVector<int> UniformMask(VF, 0);
12614+ std::iota(UniformMask.begin(), UniformMask.end(), 0);
12615+ V = FinalShuffle(V, UniformMask);
12616+ }
12617+ }
12618+ // Need to update the operand gather node, if actually the operand is not a
12619+ // vectorized node, but the buildvector/gather node, which matches one of
12620+ // the vectorized nodes.
12621+ if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12622+ return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12623+ }) == VE->UserTreeIndices.end()) {
12624+ auto *It =
12625+ find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12626+ return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
12627+ TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12628+ });
12629+ assert(It != VectorizableTree.end() && "Expected gather node operand.");
12630+ (*It)->VectorizedValue = V;
1263712631 }
12632+ return V;
1263812633 }
1263912634
1264012635 // Find the corresponding gather entry and vectorize it.
0 commit comments