Skip to content

Commit 0b38af8

Browse files
committed
[AArch64] match splat of bitcasted extract subvector to DUPLANE
This is another potential regression exposed by D63815. Here we peek through a bitcast to find an extract subvector and scale the splat offset based on that: splat (bitcast (extract X, C)), LaneC --> duplane (bitcast X), LaneC' Differential Revision: https://reviews.llvm.org/D71672
1 parent 218601a commit 0b38af8

File tree

2 files changed

+51
-15
lines changed

2 files changed

+51
-15
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+43-7
Original file line numberDiff line numberDiff line change
@@ -7086,19 +7086,55 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
70867086
// Otherwise, duplicate from the lane of the input vector.
70877087
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
70887088

7089-
// SelectionDAGBuilder may have "helpfully" already extracted or concatenated
7090-
// to make a vector of the same size as this SHUFFLE. We can ignore the
7091-
// extract entirely, and canonicalise the concat using WidenVector.
7092-
if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7093-
Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
7089+
// Try to eliminate a bitcasted extract subvector before a DUPLANE.
7090+
auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
7091+
// Match: dup (bitcast (extract_subv X, C)), LaneC
7092+
if (BitCast.getOpcode() != ISD::BITCAST ||
7093+
BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
7094+
return false;
7095+
7096+
// The extract index must align in the destination type. That may not
7097+
// happen if the bitcast is from narrow to wide type.
7098+
SDValue Extract = BitCast.getOperand(0);
7099+
unsigned ExtIdx = Extract.getConstantOperandVal(1);
7100+
unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
7101+
unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
7102+
unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
7103+
if (ExtIdxInBits % CastedEltBitWidth != 0)
7104+
return false;
7105+
7106+
// Update the lane value by offsetting with the scaled extract index.
7107+
LaneC += ExtIdxInBits / CastedEltBitWidth;
7108+
7109+
// Determine the casted vector type of the wide vector input.
7110+
// dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
7111+
// Examples:
7112+
// dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
7113+
// dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
7114+
unsigned SrcVecNumElts =
7115+
Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
7116+
CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
7117+
SrcVecNumElts);
7118+
return true;
7119+
};
7120+
MVT CastVT;
7121+
if (getScaledOffsetDup(V1, Lane, CastVT)) {
7122+
V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
7123+
} else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7124+
// The lane is incremented by the index of the extract.
7125+
// Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
7126+
Lane += V1.getConstantOperandVal(1);
70947127
V1 = V1.getOperand(0);
70957128
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
7129+
// The lane is decremented if we are splatting from the 2nd operand.
7130+
// Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
70967131
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
70977132
Lane -= Idx * VT.getVectorNumElements() / 2;
70987133
V1 = WidenVector(V1.getOperand(Idx), DAG);
7099-
} else if (VT.getSizeInBits() == 64)
7134+
} else if (VT.getSizeInBits() == 64) {
7135+
// Widen the operand to 128-bit register with undef.
71007136
V1 = WidenVector(V1, DAG);
7101-
7137+
}
71027138
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
71037139
}
71047140

llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll

+8-8
Original file line numberDiff line numberDiff line change
@@ -1663,8 +1663,7 @@ entry:
16631663
define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
16641664
; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
16651665
; CHECK: // %bb.0:
1666-
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
1667-
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[1]
1666+
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[3]
16681667
; CHECK-NEXT: ret
16691668
%extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
16701669
%bc = bitcast <1 x double> %extract to <2 x float>
@@ -1676,8 +1675,7 @@ define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v
16761675
define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
16771676
; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
16781677
; CHECK: // %bb.0:
1679-
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
1680-
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0]
1678+
; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[2]
16811679
; CHECK-NEXT: ret
16821680
%extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
16831681
%bc = bitcast <1 x double> %extract to <2 x float>
@@ -1689,8 +1687,7 @@ define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v
16891687
define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
16901688
; CHECK-LABEL: test_vadd_laneq5_i16_bitcast:
16911689
; CHECK: // %bb.0:
1692-
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
1693-
; CHECK-NEXT: dup v1.4h, v1.h[1]
1690+
; CHECK-NEXT: dup v1.4h, v1.h[5]
16941691
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
16951692
; CHECK-NEXT: ret
16961693
%extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
@@ -1700,6 +1697,8 @@ define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
17001697
ret <4 x i16> %r
17011698
}
17021699

1700+
; TODO: The pattern in LowerVECTOR_SHUFFLE does not match what we are looking for.
1701+
17031702
define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
17041703
; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned:
17051704
; CHECK: // %bb.0:
@@ -1717,8 +1716,7 @@ define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x
17171716
define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
17181717
; CHECK-LABEL: test_vadd_lane5_i16_bitcast_bigger_aligned:
17191718
; CHECK: // %bb.0:
1720-
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
1721-
; CHECK-NEXT: dup v1.4h, v1.h[1]
1719+
; CHECK-NEXT: dup v1.4h, v1.h[5]
17221720
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
17231721
; CHECK-NEXT: ret
17241722
%extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1728,6 +1726,8 @@ define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x
17281726
ret <4 x i16> %r
17291727
}
17301728

1729+
; Negative test - can't dup bytes {3,4} of v8i16.
1730+
17311731
define <4 x i16> @test_vadd_lane_i16_bitcast_bigger_unaligned(<4 x i16> %a, <16 x i8> %v) {
17321732
; CHECK-LABEL: test_vadd_lane_i16_bitcast_bigger_unaligned:
17331733
; CHECK: // %bb.0:

0 commit comments

Comments
 (0)