[SLP] Initial vectorization of non-power-of-2 ops. #77790
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes: This patch introduces a new VectorizeWithPadding node type for root and … VectorizeWithPadding load nodes will pad the result to the next power of 2 … Non-leaf nodes will operate on normal power-of-2 vectors. For those … VectorizeWithPadding store nodes strip away the padding elements and … Note that re-ordering and shuffling is not implemented for nodes requiring padding yet, to keep the initial implementation simpler. The initial implementation also only tries to vectorize with padding if … The feature is guarded by a new flag, off by default for now.

Patch is 135.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77790.diff

7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 055fbb00871f89..a281ec3acb3b46 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -179,6 +179,10 @@ static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
+static cl::opt<bool> VectorizeWithPadding(
+ "slp-vectorize-with-padding", cl::init(false), cl::Hidden,
+ cl::desc("Try to vectorize non-power-of-2 operations using padding."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -2557,7 +2561,7 @@ class BoUpSLP {
unsigned getVectorFactor() const {
if (!ReuseShuffleIndices.empty())
return ReuseShuffleIndices.size();
- return Scalars.size();
+ return Scalars.size() + getNumPadding();
};
/// A vector of scalars.
@@ -2574,6 +2578,7 @@ class BoUpSLP {
/// intrinsics for store/load)?
enum EntryState {
Vectorize,
+ VectorizeWithPadding,
ScatterVectorize,
PossibleStridedVectorize,
NeedToGather
@@ -2611,6 +2616,9 @@ class BoUpSLP {
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+ /// The number of padding lanes (containing poison).
+ unsigned NumPadding = 0;
+
public:
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
@@ -2733,6 +2741,15 @@ class BoUpSLP {
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
+ /// Set the number of padding lanes for this node.
+ void setNumPadding(unsigned Padding) {
+ assert(NumPadding == 0 && "Cannot change padding more than once.");
+ NumPadding = Padding;
+ }
+
+ /// Return the number of padding lanes (containing poison) for this node.
+ unsigned getNumPadding() const { return NumPadding; }
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
@@ -2750,6 +2767,9 @@ class BoUpSLP {
case Vectorize:
dbgs() << "Vectorize\n";
break;
+ case VectorizeWithPadding:
+ dbgs() << "VectorizeWithPadding\n";
+ break;
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
@@ -2790,6 +2810,8 @@ class BoUpSLP {
for (const auto &EInfo : UserTreeIndices)
dbgs() << EInfo << ", ";
dbgs() << "\n";
+ if (getNumPadding() > 0)
+ dbgs() << "Padding: " << getNumPadding() << "\n";
}
#endif
};
@@ -2891,9 +2913,19 @@ class BoUpSLP {
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
}
- if (UserTreeIdx.UserTE)
+ if (UserTreeIdx.UserTE) {
Last->UserTreeIndices.push_back(UserTreeIdx);
-
+ if (!isPowerOf2_32(Last->Scalars.size()) &&
+ Last->State != TreeEntry::VectorizeWithPadding) {
+ if (UserTreeIdx.UserTE->State == TreeEntry::VectorizeWithPadding)
+ Last->setNumPadding(1);
+ else {
+ Last->setNumPadding(UserTreeIdx.UserTE->getNumPadding());
+ assert((Last->getNumPadding() == 0 || Last->ReorderIndices.empty()) &&
+ "Reodering isn't implemented for nodes with padding yet");
+ }
+ }
+ }
return Last;
}
@@ -2921,7 +2953,8 @@ class BoUpSLP {
/// and fills required data before actual scheduling of the instructions.
TreeEntry::EntryState getScalarsVectorizationState(
InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+ bool HasPadding) const;
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
@@ -3822,6 +3855,7 @@ namespace {
enum class LoadsState {
Gather,
Vectorize,
+ VectorizeWithPadding,
ScatterVectorize,
PossibleStridedVectorize
};
@@ -3898,8 +3932,10 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
std::optional<int> Diff =
getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
// Check that the sorted loads are consecutive.
+ bool NeedsPadding = !isPowerOf2_32(VL.size());
if (static_cast<unsigned>(*Diff) == VL.size() - 1)
- return LoadsState::Vectorize;
+ return NeedsPadding ? LoadsState::VectorizeWithPadding
+ : LoadsState::Vectorize;
// Simple check if not a strided access - clear order.
IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
}
@@ -4534,7 +4570,8 @@ void BoUpSLP::reorderTopToBottom() {
continue;
}
if ((TE->State == TreeEntry::Vectorize ||
- TE->State == TreeEntry::PossibleStridedVectorize) &&
+ TE->State == TreeEntry::PossibleStridedVectorize ||
+ TE->State == TreeEntry::VectorizeWithPadding) &&
isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
InsertElementInst>(TE->getMainOp()) &&
!TE->isAltShuffle()) {
@@ -4568,6 +4605,10 @@ bool BoUpSLP::canReorderOperands(
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
+ // Reordering isn't implemented for nodes with padding yet.
+ if (UserTE->getNumPadding() > 0)
+ return false;
+
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
@@ -4746,6 +4787,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
const TreeEntry *TE) {
+ // Reordering for nodes with padding not implemented yet.
+ if (TE->getNumPadding() > 0 ||
+ TE->State == TreeEntry::VectorizeWithPadding)
+ return false;
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
(IgnoreReorder && TE->Idx == 0))
@@ -5233,7 +5278,8 @@ static bool isAlternateInstruction(const Instruction *I,
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
- OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+ OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+ bool HasPadding) const {
assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
unsigned ShuffleOrOp =
@@ -5256,7 +5302,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ bool Reuse = !HasPadding && canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -5294,6 +5340,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
PointerOps)) {
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
+ case LoadsState::VectorizeWithPadding:
+ return TreeEntry::VectorizeWithPadding;
case LoadsState::ScatterVectorize:
return TreeEntry::ScatterVectorize;
case LoadsState::PossibleStridedVectorize:
@@ -5353,6 +5401,15 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
}
return TreeEntry::Vectorize;
}
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // The instruction may trigger immediate UB on the poison/undef padding
+ // elements, so force gather to avoid introducing new UB.
+ if (HasPadding)
+ return TreeEntry::NeedToGather;
+ [[fallthrough]];
case Instruction::Select:
case Instruction::FNeg:
case Instruction::Add:
@@ -5361,11 +5418,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
@@ -5548,6 +5601,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
bool DoNotFail = false) {
// Check that every instruction appears once in this bundle.
DenseMap<Value *, unsigned> UniquePositions(VL.size());
+ auto OriginalVL = VL;
for (Value *V : VL) {
if (isConstant(V)) {
ReuseShuffleIndicies.emplace_back(
@@ -5560,6 +5614,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (Res.second)
UniqueValues.emplace_back(V);
}
+
size_t NumUniqueScalarValues = UniqueValues.size();
if (NumUniqueScalarValues == VL.size()) {
ReuseShuffleIndicies.clear();
@@ -5587,6 +5642,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
NonUniqueValueVL.append(PWSz - UniqueValues.size(),
UniqueValues.back());
VL = NonUniqueValueVL;
+
+ if (UserTreeIdx.UserTE &&
+ UserTreeIdx.UserTE->getNumPadding() != 0) {
+ LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+ "for nodes with padding.\n");
+ newTreeEntry(OriginalVL, std::nullopt /*not vectorized*/, S,
+ UserTreeIdx);
+ return false;
+ }
}
return true;
}
@@ -5595,6 +5659,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return false;
}
VL = UniqueValues;
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->getNumPadding() != 0) {
+ LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for "
+ "nodes with padding.\n");
+ newTreeEntry(OriginalVL, std::nullopt /*not vectorized*/, S,
+ UserTreeIdx);
+ return false;
+ }
}
return true;
};
@@ -5859,7 +5930,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
OrdersType CurrentOrder;
SmallVector<Value *> PointerOps;
TreeEntry::EntryState State = getScalarsVectorizationState(
- S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+ S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps,
+ UserTreeIdx.UserTE && UserTreeIdx.UserTE->getNumPadding() > 0);
if (State == TreeEntry::NeedToGather) {
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
@@ -6001,16 +6073,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
fixupOrderingIndices(CurrentOrder);
switch (State) {
case TreeEntry::Vectorize:
+ case TreeEntry::VectorizeWithPadding:
if (CurrentOrder.empty()) {
// Original loads are consecutive and does not require reordering.
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ TE = newTreeEntry(VL, State, Bundle, S, UserTreeIdx,
ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads"
+ << (State == TreeEntry::VectorizeWithPadding
+ ? " with padding"
+ : "")
+ << ".\n");
} else {
// Need to reorder.
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ TE = newTreeEntry(VL, State, Bundle, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads"
+ << (State == TreeEntry::VectorizeWithPadding
+ ? " with padding"
+ : "")
+ << ".\n");
}
TE->setOperandsInOrder();
break;
@@ -6211,21 +6292,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
*OIter = SI->getValueOperand();
++OIter;
}
+ TreeEntry::EntryState State = isPowerOf2_32(VL.size())
+ ? TreeEntry::Vectorize
+ : TreeEntry::VectorizeWithPadding;
// Check that the sorted pointer operands are consecutive.
if (CurrentOrder.empty()) {
// Original stores are consecutive and does not require reordering.
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ TreeEntry *TE = newTreeEntry(VL, State, Bundle, S, UserTreeIdx,
ReuseShuffleIndicies);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores"
+ << (State == TreeEntry::VectorizeWithPadding
+ ? " with padding"
+ : "")
+ << ".\n");
} else {
fixupOrderingIndices(CurrentOrder);
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ TreeEntry *TE = newTreeEntry(VL, State, Bundle, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores"
+ << (State == TreeEntry::VectorizeWithPadding
+ ? " with padding"
+ : "")
+ << ".\n");
}
return;
}
@@ -6955,7 +7047,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Constant::getAllOnesValue(Ty);
}
- InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+ InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root,
+ bool WithPadding = false) {
if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
return TTI::TCC_Free;
auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
@@ -6966,7 +7059,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionsState S = getSameOpcode(VL, *R.TLI);
const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
unsigned MinVF = R.getMinVF(2 * Sz);
- if (VL.size() > 2 &&
+ if (!WithPadding && VL.size() > 2 &&
((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
(InVectors.empty() &&
any_of(seq<unsigned>(0, VL.size() / MinVF),
@@ -7002,6 +7095,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
*R.LI, *R.TLI, CurrentOrder, PointerOps);
switch (LS) {
case LoadsState::Vectorize:
+ case LoadsState::VectorizeWithPadding:
case LoadsState::ScatterVectorize:
case LoadsState::PossibleStridedVectorize:
// Mark the vectorized loads so that we don't vectorize them
@@ -7077,7 +7171,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
GatherCost -= ScalarsCost;
}
- } else if (!Root && isSplat(VL)) {
+ } else if (!WithPadding && !Root && isSplat(VL)) {
// Found the broadcasting of the single scalar, calculate the cost as
// the broadcast.
const auto *It =
@@ -7638,8 +7732,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
CommonMask[Idx] = Mask[Idx] + VF;
}
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
- Value *Root = nullptr) {
- Cost += getBuildVectorCost(VL, Root);
+ Value *Root = nullptr, bool WithPadding = false) {
+ Cost += getBuildVectorCost(VL, Root, WithPadding);
if (!Root) {
// FIXME: Need to find a way to avoid use of getNullValue here.
SmallVector<Constant *> Vals;
@@ -7743,7 +7837,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
if (!FixedVectorType::isValidElementType(ScalarTy))
return InstructionCost::getInvalid();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size() + E->getNumPadding());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
@@ -7751,7 +7845,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto It = MinBWs.find(E);
if (It != MinBWs.end()) {
ScalarTy = IntegerType::get(F->getContext(), It->second.first);
- VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ VecTy = FixedVectorType::get(ScalarTy, VL.size() + E->getNumPadding());
}
unsigned EntryVF = E->getVectorFactor();
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
@@ -7785,6 +7879,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
CommonCost =
TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::VectorizeWithPadding ||
E->State == TreeEntry::ScatterVectorize ||
E->State == TreeEntry::PossibleStridedVectorize) &&
"Unhandled state");
@@ -7890,7 +7985,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// loads) or (2) when Ptrs are the arguments of loads or stores being
// vectorized as plane wide unit-stride load/store since all the
// loads/stores are known to be from/to adjacent locations.
- assert(E->State == TreeEntry::Vectorize &&
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::VectorizeWithPadding) &&
"Entry state expected to be Vectorize here.");
if (isa<LoadInst, StoreInst>(VL0)) {
// Case 2: estimate costs for pointer related costs when vectorizing to
@@ -8146,7 +8242,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::BitCast: {
auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
Type *SrcScalarTy = VL0->getOperand(0)->getType();
- auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
+ auto *SrcVecTy =
+ FixedVectorType::get(SrcScalarTy, VL.size() + E->getNumPadding());
unsigned Opcode = ShuffleOrOp;
unsigned VecOpcode = Opcode;
if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
@@ -8156,7 +8253,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (SrcIt != MinBWs.end()) {
SrcBWSz = SrcIt->second.first;
SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
- SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
+ SrcVecTy =
+ FixedVectorType::get(SrcScalarTy, VL.size() + E->getNumPadding());
}
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
if (BWSz == SrcBWSz) {
@@ -8299,10 +8397,19 @@ BoUpSLP::getEntryCost(con...
[truncated]
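For context before the review discussion below, here is a minimal source-level sketch of the kind of pattern this enables (the function and names are illustrative, not taken from the patch's tests): three adjacent float elements are loaded, updated, and stored back, and with non-power-of-2 support SLP can form the <3 x float> load/fadd/store sequence that also shows up in the NON-POW2 check lines quoted later in this review.

```c++
// Illustrative only: a 3-wide store group that a non-power-of-2 SLP
// vectorizer could turn into a <3 x float> load, fadd and store.
void add_constants(float *dst, const float *src) {
  dst[0] = src[0] + 1.0f;
  dst[1] = src[1] + 2.0f;
  dst[2] = src[2] + 3.0f;
}
```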
Thanks for the patch, but this is too early to land. I'm working on non-power-of-2 vectorization (it is WIP and still requires a couple more patches to go). Implemented without those extra patches, it leads to perf regressions in some cases. Need to handle all this stuff correctly.
Do you have any more info about where those perf regressions are showing up (like architecture, benchmark, code pattern)? Curious if they are mostly related to reordering/shuffling/improved gathering? This patch only supports full gather, which should hopefully reflect the cost quite accurately (modulo cost-model gaps).
Done! Some of the tests are for profitability, but many of them are also to cover crashes.
I investigated SPEC + some other benchmarks; mostly the regressions are because of too-early SLP vectorization in LTO scenarios. To fix this, I need to implement a couple more patches (vectorization of gathered loads + reordering) and add some limitations.
Agreed, it turns out the new node is not really needed and non-power-of-2 vector ops are handled quite well directly by backends (checked AArch64). Adjusted the patch to drop the special TreeEntry and the tracking of NumPadding, and to generate non-power-of-2 vector ops directly for all nodes in the tree. WDYT as a starting point? 0ddbdd3
As I said before, landing this alone causes perf regressions. I'm working on this; I need to land a couple of patches to avoid this. I have been working on this feature for 3 years already (or more), and most of my previous patches are related to non-power-of-2 support.
Improve cost computation for odd vector mem ops by breaking them down into smaller power-of-2 parts and summing up the cost of those parts. This fixes the current cost estimates, which for the most part underestimated the cost, due to using getTypeLegalizationCost, which widens to the next power-of-2 in a single step in most cases. This doesn't reflect the actual cost. See https://llvm.godbolt.org/z/vMsnxMf1v for codegen for the tests. Note that there is a special case for v3i8, for which current codegen is pretty bad, due to automatic widening to v4i8, which in turn requires the conversion to go through memory ops in the stack. I am planning on fixing that as a follow-up, but I am not yet sure where to best fix this. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
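As a minimal standalone sketch of the splitting idea described in that change (the helper name is made up; this is not the actual TTI code): an odd element count is decomposed into power-of-2 chunks, and the per-chunk costs would then be summed, instead of widening to the next power of 2 in a single step.

```c++
#include <bit>
#include <vector>

// Hypothetical helper: split an element count into power-of-2 chunks,
// e.g. 7 -> {4, 2, 1}. Costing each chunk separately and summing avoids
// the single-step widening (7 -> 8) that underestimates the actual cost.
std::vector<unsigned> splitIntoPow2Parts(unsigned NumElts) {
  std::vector<unsigned> Parts;
  while (NumElts != 0) {
    unsigned Part = std::bit_floor(NumElts); // largest power of 2 <= NumElts
    Parts.push_back(Part);
    NumElts -= Part;
  }
  return Parts;
}
```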
I did a perf evaluation for this change on SPEC2017 and some large internal benchmarks on AArch64, and the only regressions I was able to spot were due to AArch64 cost-model issues, where the cost of non-power-of-2 vector ops wasn't estimated accurately (one instance fixed by #78181). With the initial level of support, do you think there are fundamental issues in SLPVectorizer that aren't related to target-specific cost models not providing accurate costs? To iterate on the latter, having limited support early would be helpful, as people can fix their cost models early and iteratively as more features are enabled for non-power-of-2 vectorization.
It contradicts the work I have been doing for several years. I have a plan for implementing this feature and am moving forward step by step. Most of this newly added stuff, like padding etc., is not needed if everything is implemented correctly. This is not a complete solution, but just a small hack to enable the feature, which may cause issues in the future and adds the extra burden of maintaining and removing it later.
Agreed that the padding isn't needed; did you see 0ddbdd3052e694eedef6bccce3b17f92dff7add7, mentioned in one of my previous comments, which removes all the padding stuff? There are still a few things that can be cleaned up there, but with that removed, the changes mostly boil down to disabling shuffling/reordering, as most of that code isn't able to handle non-power-of-2 vectors yet. As support is added for those, the restrictions could be loosened gradually.
Add custom combine to lower load <3 x i8> as the more efficient sequence below:
ldrb wX, [x0, #2]
ldrh wY, [x0]
orr wX, wY, wX, lsl #16
fmov s0, wX
At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
Improve codegen for (trunc X to <3 x i8>) by converting it to a sequence of 3 ST1.b, but first converting the truncate operand to either v8i8 or v16i8, extracting the lanes for the truncate results and storing them. At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm#77790
Updated the PR with the version without padding, including addressing @alexey-bataev's comments at 0ddbdd3052e694eedef6bccce3b17f92dff7add7.
; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
(Moved comment here 0ddbdd3#r137302312)
Is this supported on all platforms? Or do you rely on the check that this will end up with the scalarized version in the end?
I checked on AArch64 and X86 and both support OK lowering of such operations, although in a number of cases improvements can be made, e.g. #78632 and #78637 for loads and stores of <3 x i8> on AArch64.
This effectively relies on the cost models to return accurate costs for non-power-of-2 vector operations. If a vector op of a non-power-of-2 type gets scalarized in the backend, the cost model should return a high cost.
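To make that reliance concrete, here is a rough, purely illustrative formula (not an actual TTI hook) for what "return a high cost" means when a non-power-of-2 vector op would be scalarized: the target should roughly charge per-lane scalar work plus the extract/insert traffic, so SLP's profitability check naturally rejects it.

```c++
// Conceptual sketch only: approximate cost if an <N x T> operation is
// scalarized in the backend. Real targets compute this through their
// TargetTransformInfo implementations.
unsigned scalarizedVectorOpCost(unsigned NumElts, unsigned ScalarOpCost,
                                unsigned ExtractCost, unsigned InsertCost) {
  // Extract each lane, perform the scalar op, insert the result back.
  return NumElts * (ExtractCost + ScalarOpCost + InsertCost);
}
```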
Ping :)
✅ With the latest revision this PR passed the Python code formatter.
ping
if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
  LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                       "for nodes with padding.\n");
  newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
  return false;
}
Better to move this check and the one below to line 6399 (or after it) in the original source code to avoid duplication.
I moved the check up to the beginning of the else { in the lambda to avoid the duplication. I think it cannot be moved out of TryToFindDuplicates, as we only need to bail out when reshuffling would be needed.
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
if (!isPowerOf2_32(VL.size()))
  return TreeEntry::NeedToGather;
if ((Reuse || !CurrentOrder.empty()))
Remove extra parens, restore original check here
Done, thanks!
@@ -6111,6 +6141,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
    } else {
      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
Add FIXME here for non-power-of-2 support
Added, thanks!
if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) {
  NonPowerOf2VF = CandVF;
}
Suggested change:
if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
  NonPowerOf2VF = CandVF;
Done, thanks!
@@ -14798,14 +14843,23 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
        continue;
      }

      std::optional<unsigned> NonPowerOf2VF;
Suggested change:
unsigned NonPowerOf2VF = 0;
Done, thanks!
for_each(CandidateVFs, [&](unsigned &VF) {
  VF = Size;
  Size /= 2;
SmallVector<unsigned> CandidateVFs(Sz + bool(NonPowerOf2VF));
Suggested change:
SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
Better to avoid adding bool
Done, thanks!
SmallVector<unsigned> CandidateVFs(Sz + bool(NonPowerOf2VF));
unsigned Size = MinVF;
for_each(reverse(CandidateVFs), [&](unsigned &VF) {
  VF = Size > MaxVF ? *NonPowerOf2VF : Size;
Suggested change:
VF = Size > MaxVF ? NonPowerOf2VF : Size;
Done, thanks!
Latest comments should be addressed, thanks!
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
if (!isPowerOf2_32(VL.size()))
  return TreeEntry::NeedToGather;
if ((Reuse || !CurrentOrder.empty()))
Done, thanks!
@@ -6111,6 +6141,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
    } else {
      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
Added, thanks!
@@ -14798,14 +14843,23 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
        continue;
      }

      std::optional<unsigned> NonPowerOf2VF;
Done, thanks!
if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) {
  NonPowerOf2VF = CandVF;
}
Done, thanks!
for_each(CandidateVFs, [&](unsigned &VF) {
  VF = Size;
  Size /= 2;
SmallVector<unsigned> CandidateVFs(Sz + bool(NonPowerOf2VF));
Done, thanks!
SmallVector<unsigned> CandidateVFs(Sz + bool(NonPowerOf2VF));
unsigned Size = MinVF;
for_each(reverse(CandidateVFs), [&](unsigned &VF) {
  VF = Size > MaxVF ? *NonPowerOf2VF : Size;
Done, thanks!
@@ -2806,6 +2810,9 @@ class BoUpSLP {
    SmallVectorImpl<Value *> *OpScalars = nullptr,
    SmallVectorImpl<Value *> *AltScalars = nullptr) const;

  /// Return true if this is a non-power-of-2 node.
  bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
What if ReuseShuffleIndices is not empty? Will it work? Can you add a test?
All code paths should guard against that AFAICT. I added an assertion to make sure. Couldn't find any test case that triggers this across large code bases (SPEC2006, SPEC2017, llvm-test-suite, clang bootstrap and large internal benchmarks)
ping
LG
@fhahn This PR breaks our CI: dtcxzyw/llvm-opt-benchmark#514
Reduced test case:
@dtcxzyw thanks for the reproducer, the issue was that in some cases on X86, MinVF wasn't a power-of-2. Fixed by using the stored type to compute the VF passed to …
This patch enables vectorization for non-power-of-2 VFs. Initially only VFs where adding 1 makes the VF a power-of-2, i.e. we can still make relatively effective use of the vectors. It relies on the existing target cost-models to return accurate costs for non-power-of-2 vectors. I checked mostly AArch64 and X86, and there the costs seem reasonable for the cases I checked, although I expect there will be a need to refine both the cost-models and lowering to make the most effective use of non-power-of-2 SLP vectorization. Note that re-ordering and shuffling is not implemented for nodes requiring padding yet, to keep the initial implementation simpler. The feature is guarded by a new flag, off by default for now. PR: llvm#77790
SLP supports non-power-of-2 vectors [1], so we should consider supporting this for RISC-V vector code generation. It is natural to support non-power-of-2 VLS vectors for the vector extension, as VL does not impose any constraints on this. In theory, we could support any length, but we want to prevent the number of MVTs from growing too quickly. Therefore, we only add v3, v5, v7 and v15. [1] llvm#77790
This patch enables vectorization for non-power-of-2 VFs. Initially only
VFs where adding 1 makes the VF a power-of-2, i.e. we can still make
relatively effective use of the vectors.
It relies on the existing target cost-models to return accurate costs for
non-power-of-2 vectors. I checked mostly AArch64 and X86 and
there the costs seem reasonable for the costs I checked, although
I expect there will be a need to refine both the cost-models and lowering
to make most effective use of non-power-of-2 SLP vectorization.
Note that re-ordering and shuffling is not implemented for nodes
requiring padding yet to keep the initial implementation simpler.
The feature is guarded by a new flag, off by default for now.
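As a concrete reading of the "adding 1 makes the VF a power-of-2" restriction above, the accepted non-power-of-2 VFs are 3, 7, 15, and so on, so at most one extra lane is needed to reach a power of 2. Below is a small sketch of that predicate, mirroring the isPowerOf2_32(CandVF + 1) check quoted earlier in the review.

```c++
#include <bit>

// Sketch of the initial VF restriction: a non-power-of-2 VF is only
// considered when VF + 1 is a power of 2 (e.g. 3, 7, 15).
bool isCandidateNonPow2VF(unsigned VF) {
  return !std::has_single_bit(VF) && std::has_single_bit(VF + 1);
}
```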