Skip to content

Commit

Permalink
[VectorCombine] foldShuffleOfBinops - fold shuffle(binop(shuffle(x),s…
Browse files Browse the repository at this point in the history
…huffle(z)),binop(shuffle(y),shuffle(w)) -> binop(shuffle(x,z),shuffle(y,w)) (llvm#120984)

Some patterns (in particular horizontal style patterns) can end up with shuffles straddling both sides of a binop/cmp.

Although the individual folds aren't worth it on their own, merging the (one-use) shuffles lets us notably reduce the net instruction count and cost.

One of the final steps towards addressing llvm#34072.
  • Loading branch information
RKSimon authored Jan 3, 2025
1 parent cdad183 commit e3ec5a7
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 146 deletions.
34 changes: 32 additions & 2 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1743,6 +1743,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);

// Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
// where one-use shuffles have gotten split across the binop/cmp. These
// often allow a major reduction in total cost that wouldn't happen as
// individual folds.
//
// MergeInner tries to look through a one-use shuffle feeding operand Op and
// fold that shuffle's mask into the outer shuffle mask:
//   Op       - operand to inspect; on success it is rebound (by reference) to
//              the inner shuffle's source value.
//   Offset   - first index of the range in Mask that refers to Op; only
//              entries in [Offset, Offset + NumSrcElts) are rewritten.
//   Mask     - outer shuffle mask, updated in place.
//   CostKind - cost kind used to price the inner shuffle.
//              NOTE(review): this parameter shadows the CostKind already
//              visible in the enclosing scope.
// Returns true if Op matched and Op/Mask/OldCost were updated, false if
// nothing was changed.
auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
TTI::TargetCostKind CostKind) -> bool {
Value *InnerOp;
ArrayRef<int> InnerMask;
// Only merge when Op is a single-use shuffle whose second operand matches
// m_Undef(), whose source has the same type as its result, and whose mask
// never holds an index at or beyond NumSrcElts.
if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
m_Mask(InnerMask)))) &&
InnerOp->getType() == Op->getType() &&
all_of(InnerMask,
[NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
for (int &M : Mask)
// Only entries that select from Op's range of the outer mask are remapped.
if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
// Compose the masks: replace the outer index with the element the inner
// shuffle would have picked for that lane.
M = InnerMask[M - Offset];
// Rebase non-negative results back into Op's range; negative entries are
// kept unchanged (presumably undef/poison sentinels - confirm).
M = 0 <= M ? M + Offset : M;
}
// Add the inner shuffle's cost to OldCost, since the merged form no longer
// goes through it.
OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
Op = InnerOp;
return true;
}
return false;
};
// Try to merge an inner shuffle on each of the four binop operands. X and Y
// are merged at offset 0 and Z and W at offset NumSrcElts; X/Z update NewMask0
// and Y/W update NewMask1. The non-short-circuiting '|=' means all four
// attempts always run, even after an earlier one has succeeded.
bool ReducedInstCount = false;
ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);

InstructionCost NewCost =
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
Expand All @@ -1763,8 +1793,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {

// If either shuffle will constant fold away, then fold for the same cost as
// we will reduce the instruction count.
bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) ||
(isa<Constant>(Y) && isa<Constant>(W));
ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
(isa<Constant>(Y) && isa<Constant>(W));
if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
return false;

Expand Down
Loading

0 comments on commit e3ec5a7

Please sign in to comment.