
Commit 5ffe2ba (parent 294cc52)
Author: Jun Bum Lim

Improve ISel using across lane min/max reduction

In vectorized integer min/max reduction code, the final "reduce" step
is sub-optimal. In AArch64, this change will combine:

  %svn0 = vector_shuffle %0, undef<2,3,u,u>
  %smax0 = smax %0, %svn0
  %svn3 = vector_shuffle %smax0, undef<1,u,u,u>
  %sc = setcc %smax0, %svn3, gt
  %n0 = extract_vector_elt %sc, #0
  %n1 = extract_vector_elt %smax0, #0
  %n2 = extract_vector_elt %smax0, #1
  %result = select %n0, %n1, %n2

becomes:

  %1 = smaxv %0
  %result = extract_vector_elt %1, 0

This change extends r246790.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247575 91177308-0d34-0410-b5e6-96231b3b80d8
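
As an illustration of the pattern, here is a hedged sketch in LLVM IR (the
function and value names are invented for this example, not taken from the
commit): the LoopVectorizer's log2-shuffle form of a signed-max reduction
over <4 x i32>. With this change, llc for AArch64 should select a single
smaxv for the whole clean-up sequence:

  define i32 @smax_S(<4 x i32>* %arr) {
    ; Step 0: fold the high half onto the low half, keeping the larger lanes.
    %v = load <4 x i32>, <4 x i32>* %arr
    %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %c0 = icmp sgt <4 x i32> %v, %s0
    %m0 = select <4 x i1> %c0, <4 x i32> %v, <4 x i32> %s0
    ; Step 1: compare lane 0 with lane 1 and pick the final maximum.
    %s1 = shufflevector <4 x i32> %m0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %c1 = icmp sgt <4 x i32> %m0, %s1
    %c1.0 = extractelement <4 x i1> %c1, i32 0
    %m0.0 = extractelement <4 x i32> %m0, i32 0
    %s1.0 = extractelement <4 x i32> %s1, i32 0
    %r = select i1 %c1.0, i32 %m0.0, i32 %s1.0
    ret i32 %r
  }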

3 files changed, +485 -61 lines


lib/Target/AArch64/AArch64ISelLowering.cpp (+190 -53)
@@ -8585,68 +8585,43 @@ static SDValue performPostLD1Combine(SDNode *N,
   return SDValue();
 }
 
-/// Target-specific DAG combine for the across vector reduction.
-/// This function specifically handles the final clean-up step of a vector
-/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
-/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s)
-/// elements are reduced, where s is an induction variable from 0
-/// to log2(NumVectorElements).
-/// For example,
-///   %1 = vector_shuffle %0, <2,3,u,u>
-///   %2 = add %0, %1
-///   %3 = vector_shuffle %2, <1,u,u,u>
-///   %4 = add %2, %3
-///   %5 = extract_vector_elt %4, 0
-/// becomes :
-///   %0 = uaddv %0
-///   %1 = extract_vector_elt %0, 0
-///
-/// FIXME: Currently this function is implemented and tested specifically
-/// for the add reduction. We could also support other types of across lane
-/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
-/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
-static SDValue
-performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
-                                  const AArch64Subtarget *Subtarget) {
-  if (!Subtarget->hasNEON())
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+                                                     unsigned Op,
+                                                     SelectionDAG &DAG) {
+  EVT VTy = OpV->getOperand(0).getValueType();
+  if (!VTy.isVector())
     return SDValue();
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
 
-  // Check if the input vector is fed by the operator we want to handle.
-  // We specifically check only ADD for now.
-  if (N0->getOpcode() != ISD::ADD)
-    return SDValue();
-
-  // The vector extract idx must constant zero because we only expect the final
-  // result of the reduction is placed in lane 0.
-  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
-    return SDValue();
-
-  EVT EltTy = N0.getValueType().getVectorElementType();
-  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
-    return SDValue();
-
-  int NumVecElts = N0.getValueType().getVectorNumElements();
+  int NumVecElts = VTy.getVectorNumElements();
   if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
     return SDValue();
 
   int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
-  SDValue PreOp = N0;
+  SDValue PreOp = OpV;
   // Iterate over each step of the across vector reduction.
   for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
-    // We specifically check ADD for now.
-    if (PreOp.getOpcode() != ISD::ADD)
-      return SDValue();
     SDValue CurOp = PreOp.getOperand(0);
     SDValue Shuffle = PreOp.getOperand(1);
     if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
-      // Try to swap the 1st and 2nd operand as add is commutative.
+      // Try to swap the 1st and 2nd operand as add and min/max instructions
+      // are commutative.
       CurOp = PreOp.getOperand(1);
       Shuffle = PreOp.getOperand(0);
       if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
         return SDValue();
     }
+
+    // Check if the input vector is fed by the operator we want to handle,
+    // except the last step; the very first input vector is not necessarily
+    // the same operator we are handling.
+    if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+      return SDValue();
+
     // Check if it forms one step of the across vector reduction.
     // E.g.,
     //   %cur = add %1, %0
@@ -8674,11 +8649,169 @@ performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
 
     PreOp = CurOp;
   }
+  unsigned Opcode;
+  switch (Op) {
+  default:
+    llvm_unreachable("Unexpected operator for across vector reduction");
+  case ISD::ADD:
+    Opcode = AArch64ISD::UADDV;
+    break;
+  case ISD::SMAX:
+    Opcode = AArch64ISD::SMAXV;
+    break;
+  case ISD::UMAX:
+    Opcode = AArch64ISD::UMAXV;
+    break;
+  case ISD::SMIN:
+    Opcode = AArch64ISD::SMINV;
+    break;
+  case ISD::UMIN:
+    Opcode = AArch64ISD::UMINV;
+    break;
+  }
   SDLoc DL(N);
-  return DAG.getNode(
-      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
-      DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
-      DAG.getConstant(0, DL, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                     DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector:
+///   %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+///   %smax0 = smax %0, %svn0
+///   %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax1 = smax %smax0, %svn1
+///   %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax2 = smax %smax1, %svn2
+///   %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %sc = setcc %smax2, %svn3, gt
+///   %n0 = extract_vector_elt %sc, #0
+///   %n1 = extract_vector_elt %smax2, #0
+///   %n2 = extract_vector_elt %smax2, #1
+///   %result = select %n0, %n1, %n2
+/// becomes:
+///   %1 = smaxv %0
+///   %result = extract_vector_elt %1, 0
+/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and
+/// UMINV. We could also support other types of across lane reduction
+/// available in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV.
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue IfTrue = N->getOperand(1);
+  SDValue IfFalse = N->getOperand(2);
+
+  // Check if the SELECT merges up the final result of the min/max
+  // from a vector.
+  if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // Expect N0 to be fed by a SETCC.
+  SDValue SetCC = N0.getOperand(0);
+  EVT SetCCVT = SetCC.getValueType();
+  if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+      SetCCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue VectorOp = SetCC.getOperand(0);
+  unsigned Op = VectorOp->getOpcode();
+  // Check if the input vector is fed by the operator we want to handle.
+  if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN)
+    return SDValue();
+
+  EVT VTy = VectorOp.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  // Check if we are extracting from the same vector.
+  // For example,
+  //   %sc = setcc %vector, %svn1, gt
+  //   %n0 = extract_vector_elt %sc, #0
+  //   %n1 = extract_vector_elt %vector, #0
+  //   %n2 = extract_vector_elt %vector, #1
+  if (!(VectorOp == IfTrue->getOperand(0) &&
+        VectorOp == IfFalse->getOperand(0)))
+    return SDValue();
+
+  // Check if the condition code matches the operator type.
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+  if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+      (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+      (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+      (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE))
+    return SDValue();
+
+  // Expect to check only lane 0 from the vector SETCC.
+  if (!isa<ConstantSDNode>(N0.getOperand(1)) ||
+      cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue() != 0)
+    return SDValue();
+
+  // Expect to extract the true value from lane 0.
+  if (!isa<ConstantSDNode>(IfTrue.getOperand(1)) ||
+      cast<ConstantSDNode>(IfTrue.getOperand(1))->getZExtValue() != 0)
+    return SDValue();
+
+  // Expect to extract the false value from lane 1.
+  if (!isa<ConstantSDNode>(IfFalse.getOperand(1)) ||
+      cast<ConstantSDNode>(IfFalse.getOperand(1))->getZExtValue() != 1)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector:
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %result = extract_vector_elt %4, 0
+/// becomes:
+///   %0 = uaddv %0
+///   %result = extract_vector_elt %0, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the ADD.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract index must be constant zero because we only expect
+  // the final result of the reduction to be placed in lane 0.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue() != 0)
+    return SDValue();
+
+  EVT VTy = N0.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
 }
 
 /// Target-specific DAG combine function for NEON load/store intrinsics
@@ -9259,8 +9392,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performBitcastCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
-  case ISD::SELECT:
-    return performSelectCombine(N, DCI);
+  case ISD::SELECT: {
+    SDValue RV = performSelectCombine(N, DCI);
+    if (!RV.getNode())
+      RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+    return RV;
+  }
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
   case ISD::STORE:
@@ -9276,7 +9413,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::EXTRACT_VECTOR_ELT:
-    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
+    return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
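
To see how the pieces above fit together, here is a hedged end-to-end
sketch in LLVM IR (all names invented for illustration): an unsigned-min
reduction over <8 x i16>. Each vector icmp/select pair below should become
a UMIN node during DAG combining, the final scalar select becomes the
SELECT node that performAcrossLaneMinMaxReductionCombine matches, and
tryMatchAcrossLaneShuffleForReduction then walks the three shuffle steps,
so the whole sequence should collapse to a single uminv:

  define i16 @umin_H(<8 x i16>* %arr) {
    %v = load <8 x i16>, <8 x i16>* %arr
    ; Step 0: reduce 8 lanes to 4.
    %s0 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %c0 = icmp ult <8 x i16> %v, %s0
    %m0 = select <8 x i1> %c0, <8 x i16> %v, <8 x i16> %s0
    ; Step 1: reduce 4 lanes to 2.
    %s1 = shufflevector <8 x i16> %m0, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %c1 = icmp ult <8 x i16> %m0, %s1
    %m1 = select <8 x i1> %c1, <8 x i16> %m0, <8 x i16> %s1
    ; Step 2: compare lane 0 with lane 1 and merge the scalar result.
    %s2 = shufflevector <8 x i16> %m1, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %c2 = icmp ult <8 x i16> %m1, %s2
    %c2.0 = extractelement <8 x i1> %c2, i32 0
    %m1.0 = extractelement <8 x i16> %m1, i32 0
    %s2.0 = extractelement <8 x i16> %s2, i32 0
    %r = select i1 %c2.0, i16 %m1.0, i16 %s2.0
    ret i16 %r
  }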

test/CodeGen/AArch64/aarch64-addv.ll (+8 -8)
@@ -1,7 +1,7 @@
 ; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s
 
-define i8 @f_v16i8(<16 x i8>* %arr) {
-; CHECK-LABEL: f_v16i8
+define i8 @add_B(<16 x i8>* %arr) {
+; CHECK-LABEL: add_B
 ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
   %bin.rdx = load <16 x i8>, <16 x i8>* %arr
   %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -16,8 +16,8 @@ define i8 @f_v16i8(<16 x i8>* %arr) {
   ret i8 %r
 }
 
-define i16 @f_v8i16(<8 x i16>* %arr) {
-; CHECK-LABEL: f_v8i16
+define i16 @add_H(<8 x i16>* %arr) {
+; CHECK-LABEL: add_H
 ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
   %bin.rdx = load <8 x i16>, <8 x i16>* %arr
   %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -30,8 +30,8 @@ define i16 @f_v8i16(<8 x i16>* %arr) {
   ret i16 %r
 }
 
-define i32 @f_v4i32( <4 x i32>* %arr) {
-; CHECK-LABEL: f_v4i32
+define i32 @add_S( <4 x i32>* %arr) {
+; CHECK-LABEL: add_S
 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
   %bin.rdx = load <4 x i32>, <4 x i32>* %arr
   %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -42,8 +42,8 @@ define i32 @f_v4i32( <4 x i32>* %arr) {
   ret i32 %r
 }
 
-define i64 @f_v2i64(<2 x i64>* %arr) {
-; CHECK-LABEL: f_v2i64
+define i64 @add_D(<2 x i64>* %arr) {
+; CHECK-LABEL: add_D
 ; CHECK-NOT: addv
   %bin.rdx = load <2 x i64>, <2 x i64>* %arr
   %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
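
The commit's third changed file, presumably a test for the new min/max
combines, is not shown in this view. For illustration only, a function in
that test might follow the FileCheck conventions of aarch64-addv.ll above
(the function name and the exact checks here are assumptions, not taken
from the commit):

  define i32 @umax_S(<4 x i32>* %arr) {
  ; CHECK-LABEL: umax_S
  ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
    %v = load <4 x i32>, <4 x i32>* %arr
    %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %c0 = icmp ugt <4 x i32> %v, %s0
    %m0 = select <4 x i1> %c0, <4 x i32> %v, <4 x i32> %s0
    %s1 = shufflevector <4 x i32> %m0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %c1 = icmp ugt <4 x i32> %m0, %s1
    %c1.0 = extractelement <4 x i1> %c1, i32 0
    %m0.0 = extractelement <4 x i32> %m0, i32 0
    %s1.0 = extractelement <4 x i32> %s1, i32 0
    %r = select i1 %c1.0, i32 %m0.0, i32 %s1.0
    ret i32 %r
  }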
