
Commit 5ffe2ba (parent 294cc52)
Author: Jun Bum Lim

Improve ISel using across lane min/max reduction

In vectorized integer min/max reduction code, the final "reduce" step
is sub-optimal. In AArch64, this change will combine:

  %svn0 = vector_shuffle %0, undef<2,3,u,u>
  %smax0 = smax %0, %svn0
  %svn3 = vector_shuffle %smax0, undef<1,u,u,u>
  %sc = setcc %smax0, %svn3, gt
  %n0 = extract_vector_elt %sc, #0
  %n1 = extract_vector_elt %smax0, #0
  %n2 = extract_vector_elt %smax0, #1
  %result = select %n0, %n1, %n2

becomes:

  %1 = smaxv %0
  %result = extract_vector_elt %1, 0

This change extends r246790.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247575 91177308-0d34-0410-b5e6-96231b3b80d8
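
As an illustration of the pattern, here is a hedged sketch in LLVM IR (the
function and value names are invented for this example, not taken from the
commit): the LoopVectorizer's log2-shuffle form of a signed-max reduction
over <4 x i32>. With this change, llc for AArch64 should select a single
smaxv for the whole clean-up sequence:

  define i32 @smax_S(<4 x i32>* %arr) {
    ; Step 0: fold the high half onto the low half, keeping the larger lanes.
    %v = load <4 x i32>, <4 x i32>* %arr
    %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %c0 = icmp sgt <4 x i32> %v, %s0
    %m0 = select <4 x i1> %c0, <4 x i32> %v, <4 x i32> %s0
    ; Step 1: compare lane 0 with lane 1 and pick the final maximum.
    %s1 = shufflevector <4 x i32> %m0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %c1 = icmp sgt <4 x i32> %m0, %s1
    %c1.0 = extractelement <4 x i1> %c1, i32 0
    %m0.0 = extractelement <4 x i32> %m0, i32 0
    %s1.0 = extractelement <4 x i32> %s1, i32 0
    %r = select i1 %c1.0, i32 %m0.0, i32 %s1.0
    ret i32 %r
  }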

3 files changed, +485 -61 lines


lib/Target/AArch64/AArch64ISelLowering.cpp (+190 -53)
@@ -8585,68 +8585,43 @@ static SDValue performPostLD1Combine(SDNode *N,
   return SDValue();
 }
 
-/// Target-specific DAG combine for the across vector reduction.
-/// This function specifically handles the final clean-up step of a vector
-/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
-/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s)
-/// elements are reduced, where s is an induction variable from 0
-/// to log2(NumVectorElements).
-/// For example,
-///   %1 = vector_shuffle %0, <2,3,u,u>
-///   %2 = add %0, %1
-///   %3 = vector_shuffle %2, <1,u,u,u>
-///   %4 = add %2, %3
-///   %5 = extract_vector_elt %4, 0
-/// becomes :
-///   %0 = uaddv %0
-///   %1 = extract_vector_elt %0, 0
-///
-/// FIXME: Currently this function is implemented and tested specifically
-/// for the add reduction. We could also support other types of across lane
-/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
-/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
-static SDValue
-performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
-                                  const AArch64Subtarget *Subtarget) {
-  if (!Subtarget->hasNEON())
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+                                                     unsigned Op,
+                                                     SelectionDAG &DAG) {
+  EVT VTy = OpV->getOperand(0).getValueType();
+  if (!VTy.isVector())
     return SDValue();
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
 
-  // Check if the input vector is fed by the operator we want to handle.
-  // We specifically check only ADD for now.
-  if (N0->getOpcode() != ISD::ADD)
-    return SDValue();
-
-  // The vector extract idx must constant zero because we only expect the final
-  // result of the reduction is placed in lane 0.
-  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
-    return SDValue();
-
-  EVT EltTy = N0.getValueType().getVectorElementType();
-  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
-    return SDValue();
-
-  int NumVecElts = N0.getValueType().getVectorNumElements();
+  int NumVecElts = VTy.getVectorNumElements();
   if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
     return SDValue();
 
   int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
-  SDValue PreOp = N0;
+  SDValue PreOp = OpV;
   // Iterate over each step of the across vector reduction.
   for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
-    // We specifically check ADD for now.
-    if (PreOp.getOpcode() != ISD::ADD)
-      return SDValue();
     SDValue CurOp = PreOp.getOperand(0);
     SDValue Shuffle = PreOp.getOperand(1);
     if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
-      // Try to swap the 1st and 2nd operand as add is commutative.
+      // Try to swap the 1st and 2nd operand as add and min/max instructions
+      // are commutative.
       CurOp = PreOp.getOperand(1);
       Shuffle = PreOp.getOperand(0);
       if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
         return SDValue();
     }
+
+    // Check if the input vector is fed by the operator we want to handle,
+    // except the last step; the very first input vector is not necessarily
+    // the same operator we are handling.
+    if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+      return SDValue();
+
     // Check if it forms one step of the across vector reduction.
     // E.g.,
     //   %cur = add %1, %0
@@ -8674,11 +8649,169 @@ performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
 
     PreOp = CurOp;
   }
+  unsigned Opcode;
+  switch (Op) {
+  default:
+    llvm_unreachable("Unexpected operator for across vector reduction");
+  case ISD::ADD:
+    Opcode = AArch64ISD::UADDV;
+    break;
+  case ISD::SMAX:
+    Opcode = AArch64ISD::SMAXV;
+    break;
+  case ISD::UMAX:
+    Opcode = AArch64ISD::UMAXV;
+    break;
+  case ISD::SMIN:
+    Opcode = AArch64ISD::SMINV;
+    break;
+  case ISD::UMIN:
+    Opcode = AArch64ISD::UMINV;
+    break;
+  }
   SDLoc DL(N);
-  return DAG.getNode(
-      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
-      DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
-      DAG.getConstant(0, DL, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                     DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector:
+///   %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+///   %smax0 = smax %0, %svn0
+///   %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax1 = smax %smax0, %svn1
+///   %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax2 = smax %smax1, %svn2
+///   %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %sc = setcc %smax2, %svn3, gt
+///   %n0 = extract_vector_elt %sc, #0
+///   %n1 = extract_vector_elt %smax2, #0
+///   %n2 = extract_vector_elt %smax2, #1
+///   %result = select %n0, %n1, %n2
+/// becomes:
+///   %1 = smaxv %0
+///   %result = extract_vector_elt %1, 0
+/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and
+/// UMINV. We could also support other types of across lane reduction
+/// available in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV.
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue IfTrue = N->getOperand(1);
+  SDValue IfFalse = N->getOperand(2);
+
+  // Check if the SELECT merges up the final result of the min/max
+  // from a vector.
+  if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // Expect N0 to be fed by a SETCC.
+  SDValue SetCC = N0.getOperand(0);
+  EVT SetCCVT = SetCC.getValueType();
+  if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+      SetCCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue VectorOp = SetCC.getOperand(0);
+  unsigned Op = VectorOp->getOpcode();
+  // Check if the input vector is fed by the operator we want to handle.
+  if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN)
+    return SDValue();
+
+  EVT VTy = VectorOp.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  // Check if we are extracting from the same vector.
+  // For example,
+  //   %sc = setcc %vector, %svn1, gt
+  //   %n0 = extract_vector_elt %sc, #0
+  //   %n1 = extract_vector_elt %vector, #0
+  //   %n2 = extract_vector_elt %vector, #1
+  if (!(VectorOp == IfTrue->getOperand(0) &&
+        VectorOp == IfFalse->getOperand(0)))
+    return SDValue();
+
+  // Check if the condition code matches the operator type.
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+  if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+      (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+      (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+      (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE))
+    return SDValue();
+
+  // Expect to check only lane 0 from the vector SETCC.
+  if (!isa<ConstantSDNode>(N0.getOperand(1)) ||
+      cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue() != 0)
+    return SDValue();
+
+  // Expect to extract the true value from lane 0.
+  if (!isa<ConstantSDNode>(IfTrue.getOperand(1)) ||
+      cast<ConstantSDNode>(IfTrue.getOperand(1))->getZExtValue() != 0)
+    return SDValue();
+
+  // Expect to extract the false value from lane 1.
+  if (!isa<ConstantSDNode>(IfFalse.getOperand(1)) ||
+      cast<ConstantSDNode>(IfFalse.getOperand(1))->getZExtValue() != 1)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector:
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %result = extract_vector_elt %4, 0
+/// becomes:
+///   %0 = uaddv %0
+///   %result = extract_vector_elt %0, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the ADD.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract index must be constant zero because we only expect
+  // the final result of the reduction to be placed in lane 0.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue() != 0)
+    return SDValue();
+
+  EVT VTy = N0.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
 }
 
 /// Target-specific DAG combine function for NEON load/store intrinsics
@@ -9259,8 +9392,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performBitcastCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
-  case ISD::SELECT:
-    return performSelectCombine(N, DCI);
+  case ISD::SELECT: {
+    SDValue RV = performSelectCombine(N, DCI);
+    if (!RV.getNode())
+      RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+    return RV;
+  }
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
   case ISD::STORE:
@@ -9276,7 +9413,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::EXTRACT_VECTOR_ELT:
-    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
+    return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
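
To see how the pieces above fit together, here is a hedged end-to-end
sketch in LLVM IR (all names invented for illustration): an unsigned-min
reduction over <8 x i16>. Each vector icmp/select pair below should become
a UMIN node during DAG combining, the final scalar select becomes the
SELECT node that performAcrossLaneMinMaxReductionCombine matches, and
tryMatchAcrossLaneShuffleForReduction then walks the three shuffle steps,
so the whole sequence should collapse to a single uminv:

  define i16 @umin_H(<8 x i16>* %arr) {
    %v = load <8 x i16>, <8 x i16>* %arr
    ; Step 0: reduce 8 lanes to 4.
    %s0 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %c0 = icmp ult <8 x i16> %v, %s0
    %m0 = select <8 x i1> %c0, <8 x i16> %v, <8 x i16> %s0
    ; Step 1: reduce 4 lanes to 2.
    %s1 = shufflevector <8 x i16> %m0, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %c1 = icmp ult <8 x i16> %m0, %s1
    %m1 = select <8 x i1> %c1, <8 x i16> %m0, <8 x i16> %s1
    ; Step 2: compare lane 0 with lane 1 and merge the scalar result.
    %s2 = shufflevector <8 x i16> %m1, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %c2 = icmp ult <8 x i16> %m1, %s2
    %c2.0 = extractelement <8 x i1> %c2, i32 0
    %m1.0 = extractelement <8 x i16> %m1, i32 0
    %s2.0 = extractelement <8 x i16> %s2, i32 0
    %r = select i1 %c2.0, i16 %m1.0, i16 %s2.0
    ret i16 %r
  }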

test/CodeGen/AArch64/aarch64-addv.ll (+8 -8)
@@ -1,7 +1,7 @@
 ; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s
 
-define i8 @f_v16i8(<16 x i8>* %arr) {
-; CHECK-LABEL: f_v16i8
+define i8 @add_B(<16 x i8>* %arr) {
+; CHECK-LABEL: add_B
 ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
   %bin.rdx = load <16 x i8>, <16 x i8>* %arr
   %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -16,8 +16,8 @@ define i8 @f_v16i8(<16 x i8>* %arr) {
   ret i8 %r
 }
 
-define i16 @f_v8i16(<8 x i16>* %arr) {
-; CHECK-LABEL: f_v8i16
+define i16 @add_H(<8 x i16>* %arr) {
+; CHECK-LABEL: add_H
 ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
   %bin.rdx = load <8 x i16>, <8 x i16>* %arr
   %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -30,8 +30,8 @@ define i16 @f_v8i16(<8 x i16>* %arr) {
   ret i16 %r
 }
 
-define i32 @f_v4i32( <4 x i32>* %arr) {
-; CHECK-LABEL: f_v4i32
+define i32 @add_S( <4 x i32>* %arr) {
+; CHECK-LABEL: add_S
 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
   %bin.rdx = load <4 x i32>, <4 x i32>* %arr
   %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -42,8 +42,8 @@ define i32 @f_v4i32( <4 x i32>* %arr) {
   ret i32 %r
 }
 
-define i64 @f_v2i64(<2 x i64>* %arr) {
-; CHECK-LABEL: f_v2i64
+define i64 @add_D(<2 x i64>* %arr) {
+; CHECK-LABEL: add_D
 ; CHECK-NOT: addv
   %bin.rdx = load <2 x i64>, <2 x i64>* %arr
   %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
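
The commit's third changed file, presumably a test for the new min/max
combines, is not shown in this view. For illustration only, a function in
that test might follow the FileCheck conventions of aarch64-addv.ll above
(the function name and the exact checks here are assumptions, not taken
from the commit):

  define i32 @umax_S(<4 x i32>* %arr) {
  ; CHECK-LABEL: umax_S
  ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
    %v = load <4 x i32>, <4 x i32>* %arr
    %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %c0 = icmp ugt <4 x i32> %v, %s0
    %m0 = select <4 x i1> %c0, <4 x i32> %v, <4 x i32> %s0
    %s1 = shufflevector <4 x i32> %m0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %c1 = icmp ugt <4 x i32> %m0, %s1
    %c1.0 = extractelement <4 x i1> %c1, i32 0
    %m0.0 = extractelement <4 x i32> %m0, i32 0
    %s1.0 = extractelement <4 x i32> %s1, i32 0
    %r = select i1 %c1.0, i32 %m0.0, i32 %s1.0
    ret i32 %r
  }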
