[X86] use VPMADDWD for widening adjacent addition#174149
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Folkert de Vries (folkertdev) ChangesThe In the rust standard library, we like to implement intrinsics in terms of simpler building blocks, so that all backends can implement a small set of primitives instead of supporting all of LLVM's intrinsics. When we try that for This PR recognizes the widening adjacent addition pattern that adler32 uses directly, and manually inserts a trivial multiplication by an all-ones vector. Experimentally, performing this optimization increases adler32 throughput from 41 gb/s to 67 gb/s (rust-lang/rust#150560 (comment)) cc rust-lang/stdarch#1985 rust-lang/rust#150560 Full diff: https://github.com/llvm/llvm-project/pull/174149.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50df19b3e6e47..0bfbc3f47d12b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58033,7 +58033,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
// (extract_elt Mul, 3),
// (extract_elt Mul, 5),
// ...
- // and identify Mul.
+ // and identify Mul. Mul must be either ISD::MUL, or can be ISD::SIGN_EXTEND
+ // in which case we add a trivial multiplication by an all-ones vector.
SDValue Mul;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
@@ -58064,7 +58065,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
// with 2X number of vector elements than the BUILD_VECTOR.
// Both extracts must be from same MUL.
Mul = Vec0L;
- if (Mul.getOpcode() != ISD::MUL ||
+ if ((Mul.getOpcode() != ISD::MUL &&
+ Mul.getOpcode() != ISD::SIGN_EXTEND) ||
Mul.getValueType().getVectorNumElements() != 2 * e)
return SDValue();
}
@@ -58073,16 +58075,31 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
return SDValue();
}
- // Check if the Mul source can be safely shrunk.
- ShrinkMode Mode;
- if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
- Mode == ShrinkMode::MULU16)
- return SDValue();
+ SDValue N0, N1;
+ if (Mul.getOpcode() == ISD::MUL) {
+ // Check if the Mul source can be safely shrunk.
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+ Mode == ShrinkMode::MULU16)
+ return SDValue();
+
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+ } else {
+ assert(Mul.getOpcode() == ISD::SIGN_EXTEND);
+
+ // Add a trivial multiplication with an all-ones vector so that we can make
+ // use of VPMADDWD.
+ N0 = Mul.getOperand(0);
+ EVT SrcVT = N0.getValueType();
+ N1 = DAG.getSplatVector(SrcVT, DL, DAG.getConstant(1, DL, MVT::i16));
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements() * 2);
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i16 ||
+ SrcVT.getVectorNumElements() != 2 * VT.getVectorNumElements())
+ return SDValue();
+ }
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
diff --git a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll
index d9283aa8591fc..53f1374669ca5 100644
--- a/llvm/test/CodeGen/X86/combine-pmadd.ll
+++ b/llvm/test/CodeGen/X86/combine-pmadd.ll
@@ -331,3 +331,32 @@ define i1 @pmaddwd_pcmpgt_infinite_loop() {
%8 = icmp eq i4 %7, 0
ret i1 %8
}
+
+; If the shuffle matches, but there is no multiply, introduce a trivial multiply by an all-ones vector.
+define <8 x i32> @introduce_trivial_multiply(<16 x i16> %x) {
+; SSE-LABEL: introduce_trivial_multiply:
+; SSE: # %bb.0:
+; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE-NEXT: pmaddwd %xmm2, %xmm0
+; SSE-NEXT: pmaddwd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: introduce_trivial_multiply:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: introduce_trivial_multiply:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: retq
+ %1 = sext <16 x i16> %x to <16 x i32>
+ %2 = shufflevector <16 x i32> %1, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %3 = shufflevector <16 x i32> %1, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %4 = add nsw <8 x i32> %2, %3
+ ret <8 x i32> %4
+}
|
1de02fe to
d9eb879
Compare
b4bd5cd to
e216491
Compare
e216491 to
4f8c779
Compare
4f8c779 to
165c3f4
Compare
The `_mm256_madd_epi16` intrinsic performs first a pointwise widening multiplication, and then adds adjacent elements. In SIMD versions of the adler32 checksum algorithm, a trivial multiplication by an all-ones vector is used to get just the widening and addition behavior. In the rust standard library, we like to implement intrinsics in terms of simpler building blocks, so that all backends can implement a small set of primitives instead of supporting all of LLVM's intrinsics. When we try that for `_mm256_madd_epi16` in isolation it works, but when one of the arguments is an all-ones vector, the multiplication is optimized out long before the `vpmaddwd` instruction can be selected. This PR recognizes the widening adjacent addition pattern that adler32 uses directly, and manually inserts a trivial multiplication by an all-ones vector. Experimentally, performing this optimization increases adler32 throughput from 41 gb/s to 67 gb/s (rust-lang/rust#150560 (comment)) cc rust-lang/stdarch#1985 cc rust-lang/rust#150560
…9326) I added an optimization for `VPMADDWD` earlier in #174149. That one is used in the adler32 checksum. That PR missed another pattern, used in base64 decoding, that uses a `shl` instead of a `mul`, but also should optimize to `VPMADDWD`. To make the shift semantically equal to the multiplication case, I'm bailing on shifts by more than 15, because `1 << 16` is not representable in an `i16`. code-wise I suspect that I'm missing some convenient way to access the integer values of a constant vector.
…ADDWD` (#179326) I added an optimization for `VPMADDWD` earlier in llvm/llvm-project#174149. That one is used in the adler32 checksum. That PR missed another pattern, used in base64 decoding, that uses a `shl` instead of a `mul`, but also should optimize to `VPMADDWD`. To make the shift semantically equal to the multiplication case, I'm bailing on shifts by more than 15, because `1 << 16` is not representable in an `i16`. code-wise I suspect that I'm missing some convenient way to access the integer values of a constant vector.
…m#179326) I added an optimization for `VPMADDWD` earlier in llvm#174149. That one is used in the adler32 checksum. That PR missed another pattern, used in base64 decoding, that uses a `shl` instead of a `mul`, but also should optimize to `VPMADDWD`. To make the shift semantically equal to the multiplication case, I'm bailing on shifts by more than 15, because `1 << 16` is not representable in an `i16`. code-wise I suspect that I'm missing some convenient way to access the integer values of a constant vector.
The
_mm256_madd_epi16intrinsic performs first a pointwise widening multiplication, and then adds adjacent elements. In SIMD versions of the adler32 checksum algorithm, a trivial multiplication by an all-ones vector is used to get just the widening and addition behavior.In the rust standard library, we like to implement intrinsics in terms of simpler building blocks, so that all backends can implement a small set of primitives instead of supporting all of LLVM's intrinsics. When we try that for
_mm256_madd_epi16in isolation it works, but when one of the arguments is an all-ones vector, the multiplication is optimized out long before thevpmaddwdinstruction can be selected.This PR recognizes the widening adjacent addition pattern that adler32 uses directly, and manually inserts a trivial multiplication by an all-ones vector. Experimentally, performing this optimization increases adler32 throughput from 41 gb/s to 67 gb/s (rust-lang/rust#150560 (comment))
cc rust-lang/stdarch#1985 rust-lang/rust#150560