-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VectorCombine] Add foldShuffleToIdentity #88693
Conversation
@llvm/pr-subscribers-llvm-transforms Author: David Green (davemgreen) ChangesThis patch adds a basic version of a combine that attempts to remove shuffles that when combined simplify away to an identity shuffle. For example: The code tracks each lane starting from the original shuffle, keeping a track of a vector of {src, idx}. As we propagate up through the instructions we will either look through intermediate instructions (binops and unops) or see a collections of lanes that all have the same src and incrementing idx (an identity). We can also see a single value with identical lanes, which we can treat like a splat. Only the basic version is added here, handling identities, splats, binops and unops. In follow-up patches other instructions can be added such as constants, intrinsics, cmp/sel and zext/sext/trunc. Patch is 27.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88693.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e0e2f50c89adad..12e37f67d8b264 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -113,6 +113,7 @@ class VectorCombine {
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
bool foldShuffleOfCastops(Instruction &I);
+ bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
bool foldTruncFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1547,6 +1548,145 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
return true;
}
+// Starting from a shuffle, look up through operands tracking the shuffled index
+// of each lane. If we can simplify away the shuffles to identities then
+// do so.
+bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
+ FixedVectorType *Ty = dyn_cast<FixedVectorType>(I.getType());
+ if (!Ty || !isa<Instruction>(I.getOperand(0)) ||
+ !isa<Instruction>(I.getOperand(1)))
+ return false;
+
+ using InstLane = std::pair<Value *, int>;
+
+ auto LookThroughShuffles = [](Value *V, int Lane) -> InstLane {
+ while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ unsigned NumElts =
+ cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
+ int M = SV->getMaskValue(Lane);
+ if (M < 0)
+ return {nullptr, -1};
+ else if (M < (int)NumElts) {
+ V = SV->getOperand(0);
+ Lane = M;
+ } else {
+ V = SV->getOperand(1);
+ Lane = M - NumElts;
+ }
+ }
+ return InstLane{V, Lane};
+ };
+
+ auto GenerateInstLaneVectorFromOperand =
+ [&LookThroughShuffles](const SmallVector<InstLane> &Item, int Op) {
+ SmallVector<InstLane> NItem;
+ for (InstLane V : Item) {
+ NItem.emplace_back(
+ !V.first
+ ? InstLane{nullptr, -1}
+ : LookThroughShuffles(
+ cast<Instruction>(V.first)->getOperand(Op), V.second));
+ }
+ return NItem;
+ };
+
+ SmallVector<InstLane> Start;
+ for (unsigned M = 0; M < Ty->getNumElements(); ++M)
+ Start.push_back(LookThroughShuffles(&I, M));
+
+ SmallVector<SmallVector<InstLane>> Worklist;
+ Worklist.push_back(Start);
+ SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs;
+
+ while (!Worklist.empty()) {
+ SmallVector<InstLane> Item = Worklist.pop_back_val();
+
+ // If we found an undef first lane then bail out to keep things simple.
+ if (!Item[0].first)
+ return false;
+
+ // Look for an identity value.
+ if (Item[0].second == 0 && Item[0].first->getType() == Ty &&
+ all_of(drop_begin(enumerate(Item)), [&](const auto &E) {
+ return !E.value().first || (E.value().first == Item[0].first &&
+ E.value().second == (int)E.index());
+ })) {
+ IdentityLeafs.insert(Item[0].first);
+ continue;
+ }
+ // Look for a splat value.
+ if (all_of(drop_begin(Item), [&](InstLane &IL) {
+ return !IL.first ||
+ (IL.first == Item[0].first && IL.second == Item[0].second);
+ })) {
+ SplatLeafs.insert(Item[0].first);
+ continue;
+ }
+
+ // We need each element to be the same type of value, and check that each
+ // element has a single use.
+ if (!all_of(drop_begin(Item), [&](InstLane IL) {
+ if (!IL.first)
+ return true;
+ if (isa<Instruction>(IL.first) &&
+ !cast<Instruction>(IL.first)->hasOneUse())
+ return false;
+ return IL.first->getValueID() == Item[0].first->getValueID() &&
+ (!isa<IntrinsicInst>(IL.first) ||
+ cast<IntrinsicInst>(IL.first)->getIntrinsicID() ==
+ cast<IntrinsicInst>(Item[0].first)->getIntrinsicID());
+ }))
+ return false;
+
+ // Check the operator is one that we support.
+ if (isa<BinaryOperator>(Item[0].first)) {
+ Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0));
+ Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 1));
+ } else if (isa<UnaryOperator>(Item[0].first)) {
+ Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0));
+ } else {
+ return false;
+ }
+ }
+
+ // If we got this far, we know the shuffles are superfluous and can be
+ // removed. Scan through again and generate the new tree of instructions.
+ std::function<Value *(const SmallVector<InstLane> &)> generate =
+ [&](const SmallVector<InstLane> &Item) -> Value * {
+ if (IdentityLeafs.contains(Item[0].first) &&
+ all_of(drop_begin(enumerate(Item)), [&](const auto &E) {
+ return !E.value().first || (E.value().first == Item[0].first &&
+ E.value().second == (int)E.index());
+ })) {
+ return Item[0].first;
+ } else if (SplatLeafs.contains(Item[0].first)) {
+ if (auto ILI = dyn_cast<Instruction>(Item[0].first))
+ Builder.SetInsertPoint(*ILI->getInsertionPointAfterDef());
+ else if (isa<Argument>(Item[0].first))
+ Builder.SetInsertPointPastAllocas(I.getParent()->getParent());
+ SmallVector<int, 16> Mask(Ty->getNumElements(), Item[0].second);
+ return Builder.CreateShuffleVector(Item[0].first, Mask);
+ }
+
+ auto *I = cast<Instruction>(Item[0].first);
+ SmallVector<Value *> Ops;
+ unsigned E = I->getNumOperands();
+ for (unsigned Idx = 0; Idx < E; Idx++)
+ Ops.push_back(generate(GenerateInstLaneVectorFromOperand(Item, Idx)));
+ Builder.SetInsertPoint(I);
+ if (auto BI = dyn_cast<BinaryOperator>(I))
+ return Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
+ Ops[0], Ops[1]);
+ if (auto UI = dyn_cast<UnaryOperator>(I))
+ return Builder.CreateUnOp((Instruction::UnaryOps)UI->getOpcode(), Ops[0]);
+ llvm_unreachable("Unhandled instruction in generate");
+ };
+
+ Value *V = generate(Start);
+ replaceValue(I, *V);
+ return true;
+}
+
/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
@@ -2103,6 +2243,7 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfBinops(I);
MadeChange |= foldShuffleOfCastops(I);
MadeChange |= foldSelectShuffle(I);
+ MadeChange |= foldShuffleToIdentity(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index d96dfec849167d..47f52a341df24f 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -5,10 +5,7 @@ target triple = "aarch64"
define <8 x i8> @trivial(<8 x i8> %a) {
; CHECK-LABEL: @trivial(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[AT]], <4 x i8> [[AB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: ret <8 x i8> [[R]]
+; CHECK-NEXT: ret <8 x i8> [[R:%.*]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%at = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
@@ -18,13 +15,7 @@ define <8 x i8> @trivial(<8 x i8> %a) {
define <8 x i8> @add(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: @add(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = add <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT: [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = add <8 x i8> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -83,13 +74,7 @@ define <8 x i8> @wrong_lanes(<8 x i8> %a, <8 x i8> %b) {
define <8 x half> @fadd(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: @fadd(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BT]]
-; CHECK-NEXT: [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BB]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -104,11 +89,7 @@ define <8 x half> @fadd(<8 x half> %a, <8 x half> %b) {
define <8 x half> @fneg(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: @fneg(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = fneg <4 x half> [[AT]]
-; CHECK-NEXT: [[ABB:%.*]] = fneg <4 x half> [[AB]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = fneg <8 x half> [[A:%.*]]
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -138,12 +119,8 @@ define <8 x i8> @abs(<8 x i8> %a) {
define <8 x half> @splat0(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: @splat0(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BS]]
-; CHECK-NEXT: [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BS]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[A:%.*]], [[TMP1]]
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -157,12 +134,8 @@ define <8 x half> @splat0(<8 x half> %a, <8 x half> %b) {
define <8 x half> @splat2(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: @splat2(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BS]]
-; CHECK-NEXT: [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BS]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[A:%.*]], [[TMP1]]
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -176,12 +149,8 @@ define <8 x half> @splat2(<8 x half> %a, <8 x half> %b) {
define <8 x half> @splatandidentity(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: @splatandidentity(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BS:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[ABT:%.*]] = fadd <4 x half> [[AT]], [[BS]]
-; CHECK-NEXT: [[ABB:%.*]] = fadd <4 x half> [[AB]], [[BS]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[A]], [[TMP1]]
; CHECK-NEXT: ret <8 x half> [[R]]
;
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -195,11 +164,9 @@ define <8 x half> @splatandidentity(<8 x half> %a, <8 x half> %b) {
define <8 x half> @splattwice(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: @splattwice(
-; CHECK-NEXT: [[AS:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BS:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[AB1:%.*]] = fadd <4 x half> [[AS]], [[BS]]
-; CHECK-NEXT: [[AB2:%.*]] = fadd <4 x half> [[AS]], [[BS]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[AB1]], <4 x half> [[AB2]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[TMP2]], [[TMP1]]
; CHECK-NEXT: ret <8 x half> [[R]]
;
%as = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> zeroinitializer
@@ -212,13 +179,7 @@ define <8 x half> @splattwice(<8 x half> %a, <8 x half> %b) {
define <8 x i8> @undeflane(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: @undeflane(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT:%.*]] = add <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT: [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 poison, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = add <8 x i8> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -284,18 +245,9 @@ define <8 x i8> @constantdiff2(<8 x i8> %a) {
define <8 x i8> @inner_shuffle(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: @inner_shuffle(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[CS:%.*]] = shufflevector <8 x i8> [[C:%.*]], <8 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[ABT:%.*]] = mul <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT: [[ABB:%.*]] = mul <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT: [[ABT2:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[ABB2:%.*]] = shufflevector <4 x i8> [[ABB]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[ABT3:%.*]] = add <4 x i8> [[ABT2]], [[CS]]
-; CHECK-NEXT: [[ABB3:%.*]] = add <4 x i8> [[ABB2]], [[CS]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[C:%.*]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i8> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[R:%.*]] = add <8 x i8> [[TMP2]], [[TMP1]]
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -338,14 +290,9 @@ define <8 x i8> @extrause_add(<8 x i8> %a, <8 x i8> %b) {
define <8 x i8> @extrause_shuffle(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: @extrause_shuffle(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B1:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT1:%.*]] = shufflevector <8 x i8> [[B1]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: [[BT1:%.*]] = shufflevector <8 x i8> [[B1:%.*]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: call void @use(<4 x i8> [[BT1]])
-; CHECK-NEXT: [[ABT:%.*]] = add <4 x i8> [[BT]], [[BT1]]
-; CHECK-NEXT: [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[R:%.*]] = add <8 x i8> [[A:%.*]], [[B1]]
; CHECK-NEXT: ret <8 x i8> [[R]]
;
%ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -543,52 +490,15 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
; CHECK-LABEL: @v8f64interleave(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Z:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP1:%.*]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 0, i32 8>
-; CHECK-NEXT: [[STRIDED_VEC27:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <2 x i32> <i32 1, i32 9>
-; CHECK-NEXT: [[STRIDE...
[truncated]
|
Funnily enough, I have started work on a VectorCombine::foldShuffleOfShuffles fold that will sort of do this (and catch many of these test cases):
But it doesn't recurse, so every fold has to be cost beneficial - are you seeing cases where we need the recursion capability? |
Hi - I get the feeling that X86 has generally better shuffle support than other architectures. Even for AArch64, which generally does OK, there can be a big difference between a low-cost shuffle and a bad one. For something like MVE which is more constrained, the differences can be a lot larger, often falling back to scalarization where it can't do much better. It means we can't really take local steps and get to the optimal solution, because each step on its own is worse for performance. It's only taken together that we end up with something better. It is not the motivating case but one that can come up a fair amount - consider the LD4/ST4 vectorization that we do for interleaving loads/stores. The load is various shuffle(loads) and the store a store(shuffle(shuffle, shuffle)). If the intervening instructions are all the same then the whole thing can be simplified away to continuous loads/stores, but a single shuffle moved only breaks the pattern for LD4/ST4. I think the v8f64interleave test shows something similar, but it involves splat'd vectors and wider interleaving groups. I wanted to add some recursion limit to this code, to make sure it doesn't go too wrong. I will try and look into why #88743 causes regressions too. |
Yes - we've never come up with a good way to accumulate candidate folds and only commit them if/when it becomes cost effective. Please don't think I want to just take #88743 + #88899 instead of this, I think they're both needed, I just want to make sure we don't end up with a heavy fold constantly doing too much of the combining. |
The trivial cases were just added for testing - many of them are already handled by instcombine. Some of those combines in instcombine are the ones causing the problem, as they already move shuffles through binops without considering the costs and breaking the canonical patterns for interleaving loads. We were previously trying to fix that in the InterleavedAccess (through replaceBinOpShuffles), but that starts to become impossible as more and more folds happen in the mid-end. That is separate from this patch though, which shouldn't create the same problems of breaking interleaving patterns without replacing them with identity shuffles, which should always be an improvement. |
37dda86
to
3aacf81
Compare
// of each lane. If we can simplify away the shuffles to identities then | ||
// do so. | ||
bool VectorCombine::foldShuffleToIdentity(Instruction &I) { | ||
FixedVectorType *Ty = dyn_cast<FixedVectorType>(I.getType()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FixedVectorType *Ty = dyn_cast<FixedVectorType>(I.getType()); | |
auto *Ty = dyn_cast<FixedVectorType>(I.getType()); |
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements(); | ||
int M = SV->getMaskValue(Lane); | ||
if (M < 0) | ||
return {nullptr, -1}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return {nullptr, -1}; | |
return {nullptr, PoisonMaskElem}; |
}; | ||
|
||
auto GenerateInstLaneVectorFromOperand = | ||
[&LookThroughShuffles](const SmallVector<InstLane> &Item, int Op) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[&LookThroughShuffles](const SmallVector<InstLane> &Item, int Op) { | |
[&LookThroughShuffles](ArrayRef<InstLane> Item, int Op) { |
for (InstLane V : Item) { | ||
NItem.emplace_back( | ||
!V.first | ||
? InstLane{nullptr, -1} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
? InstLane{nullptr, -1} | |
? InstLane{nullptr, PoisonMaskElem} |
if (!all_of(drop_begin(Item), [&](InstLane IL) { | ||
if (!IL.first) | ||
return true; | ||
if (isa<Instruction>(IL.first) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use dyn_cast
|
||
// If we got this far, we know the shuffles are superfluous and can be | ||
// removed. Scan through again and generate the new tree of instructions. | ||
std::function<Value *(const SmallVector<InstLane> &)> generate = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::function<Value *(const SmallVector<InstLane> &)> generate = | |
auto Generate = |
// If we got this far, we know the shuffles are superfluous and can be | ||
// removed. Scan through again and generate the new tree of instructions. | ||
std::function<Value *(const SmallVector<InstLane> &)> generate = | ||
[&](const SmallVector<InstLane> &Item) -> Value * { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[&](const SmallVector<InstLane> &Item) -> Value * { | |
[&](ArrayRef<InstLane> Item) -> Value * { |
93685b2
to
7babf9e
Compare
if (auto *I = dyn_cast<Instruction>(IL.first); I && !I->hasOneUse()) | ||
return false; | ||
return IL.first->getValueID() == Item[0].first->getValueID() && | ||
(!isa<IntrinsicInst>(IL.first) || | ||
cast<IntrinsicInst>(IL.first)->getIntrinsicID() == | ||
cast<IntrinsicInst>(Item[0].first)->getIntrinsicID()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (auto *I = dyn_cast<Instruction>(IL.first); I && !I->hasOneUse()) | |
return false; | |
return IL.first->getValueID() == Item[0].first->getValueID() && | |
(!isa<IntrinsicInst>(IL.first) || | |
cast<IntrinsicInst>(IL.first)->getIntrinsicID() == | |
cast<IntrinsicInst>(Item[0].first)->getIntrinsicID()); | |
auto *I = dyn_cast<Instruction>(IL.first); | |
if (I && !I->hasOneUse()) | |
return false; | |
return IL.first->getValueID() == Item[0].first->getValueID() && | |
(!I || I->getIntrinsicID() == | |
cast<IntrinsicInst>(Item[0].first)->getIntrinsicID()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've changed it to something like this. The second one is an IntrinsicInst, to make sure the types of the intrinsics are the same.
std::function<Value *(ArrayRef<InstLane>)> generate = | ||
[&](ArrayRef<InstLane> Item) -> Value * { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::function<Value *(ArrayRef<InstLane>)> generate = | |
[&](ArrayRef<InstLane> Item) -> Value * { | |
auto Generate = | |
[&](ArrayRef<InstLane> Item) -> Value * { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is called inside the lambda, so needs a type up-front.
if (auto UI = dyn_cast<UnaryOperator>(I)) | ||
return Builder.CreateUnOp((Instruction::UnaryOps)UI->getOpcode(), Ops[0]); | ||
llvm_unreachable("Unhandled instruction in generate"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (auto UI = dyn_cast<UnaryOperator>(I)) | |
return Builder.CreateUnOp((Instruction::UnaryOps)UI->getOpcode(), Ops[0]); | |
llvm_unreachable("Unhandled instruction in generate"); | |
assert(isa<UnaryOperator>(I) && "Expected either Unary or Binary Op."); | |
return Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was planning to add a number of other types here in follow-up patches. This one just contains the basics to make sure the general idea is working OK. I can make it so that UnaryInstructions are left as the else (even if I personally prefer the structure of keeping them all consistent :) I find it more readable. )
7babf9e
to
839e2c1
Compare
839e2c1
to
7417cd3
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you intend to support intrinsics at some point? I notice you have a abs test
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[AB1]], <4 x half> [[AB2]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> | ||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer | ||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer | ||
; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[TMP2]], [[TMP1]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like we might need a foldBinopOfShuffles as well......
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All these tests were just added as examples for testing this combine, many of the smaller ones like this are already optimized by instcombine. They were only added to make sure we were testing the different cases that can come up when folding the shuffles to identities.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you intend to support intrinsics at some point? I notice you have a abs test
Yep, those and a few other operations. I wanted to make sure the basics were working before adding the others but they are hopefully relatively straightforward.
if (auto *I = dyn_cast<Instruction>(IL.first); I && !I->hasOneUse()) | ||
return false; | ||
return IL.first->getValueID() == Item[0].first->getValueID() && | ||
(!isa<IntrinsicInst>(IL.first) || | ||
cast<IntrinsicInst>(IL.first)->getIntrinsicID() == | ||
cast<IntrinsicInst>(Item[0].first)->getIntrinsicID()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've changed it to something like this. The second one is an IntrinsicInst, to make sure the types of the intrinsics are the same.
std::function<Value *(ArrayRef<InstLane>)> generate = | ||
[&](ArrayRef<InstLane> Item) -> Value * { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is called inside the lambda, so needs a type up-front.
if (auto UI = dyn_cast<UnaryOperator>(I)) | ||
return Builder.CreateUnOp((Instruction::UnaryOps)UI->getOpcode(), Ops[0]); | ||
llvm_unreachable("Unhandled instruction in generate"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was planning to add a number of other types here in follow-up patches. This one just contains the basics to make sure the general idea is working OK. I can make it so that UnaryInstructions are left as the else (even if I personally prefer the structure of keeping them all consistent :) I find it more readable. )
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[AB1]], <4 x half> [[AB2]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> | ||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer | ||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer | ||
; CHECK-NEXT: [[R:%.*]] = fadd <8 x half> [[TMP2]], [[TMP1]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All these tests were just added as examples for testing this combine, many of the smaller ones like this are already optimized by instcombine. They were only added to make sure we were testing the different cases that can come up when folding the shuffles to identities.
7417cd3
to
26000de
Compare
Rebased and ping. Thanks |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Final question - do we need handling for DivRem instructions to make sure we don't introduce poison mask elements? see 4cc9c6d
This patch adds a basic version of a combine that attempts to fold away shuffles that when combined simplify away to an identity shuffle. For example: %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> %at = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4> %abt = fneg <4 x half> %at %abb = fneg <4 x half> %ab %r = shufflevector <4 x half> %abt, <4 x half> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> By looking through the shuffles, it can be simplified to: %r = fneg <8 x half> %a The code tracks each lane starting from the original shuffle, keeping track of a vector of {src, idx}. As we propagate up through the instructions we will either look through intermediate instructions (binops and unops) or see a collection of lanes that all have the same src and incrementing idx (an identity). We can also see a single value with identical lanes, which we can treat like a splat. Only the basic version is added here, handling identities, splats, binops and unops. In follow-up patches other instructions can be added such as constants, intrinsics, cmp/sel and zext/sext/trunc.
Thanks - I hadn't considered div/rem as they don't come up as often for vectors. I can exclude them for now if that sounds OK, but as far as I understand they might be fine, as we end up removing the shuffles so there are no poison lanes left over. |
26000de
to
01af3d6
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Hi @davemgreen, we are seeing an assertion failure in a few internal tests which I bisected back to your change. I have filed #91078 for the issue, can you take a look? |
Thanks for the heads up - I'll take a look. |
Hi @davemgreen, I'm seeing an assertion failure in |
Hi - yeah I'll take a look. Thanks for letting me know. |
This patch adds a basic version of a combine that attempts to remove shuffles that when combined simplify away to an identity shuffle. For example:
%ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%at = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
%abt = fneg <4 x half> %at
%abb = fneg <4 x half> %ab
%r = shufflevector <4 x half> %abt, <4 x half> %abb, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
By looking through the shuffles and fneg, it can be simplified to:
%r = fneg <8 x half> %a
The code tracks each lane starting from the original shuffle, keeping track of a vector of {src, idx}. As we propagate up through the instructions we will either look through intermediate instructions (binops and unops) or see a collection of lanes that all have the same src and incrementing idx (an identity). We can also see a single value with identical lanes, which we can treat like a splat.
Only the basic version is added here, handling identities, splats, binops and unops. In follow-up patches other instructions can be added such as constants, intrinsics, cmp/sel and zext/sext/trunc.