[RISCV] Generalize reduction tree matching to all integer reductions #68014
Conversation
This builds on the transform introduced in llvm#67821, and generalizes it for all integer reduction types. A couple of notes:
* This will only form smax/smin/umax/umin reductions when zbb is enabled. Otherwise, we lower the min/max expressions early. I don't care about this case, and don't plan to address this further.
* This excludes floating point. Floating point introduces concerns about associativity. I may or may not do a follow up patch for that case.
* The explodevector test change is mildly undesirable from a clarity perspective. If anyone sees a good way to rewrite that to stabilize the test, please suggest.
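As a rough, hypothetical source-level illustration (not taken from the patch or its tests): a reduction written as per-lane extracts combined by a scalar integer op is the shape the DAG combine now recognizes for the integer binops listed above, e.g.:

// Illustrative only: with GNU vector extensions, each v[i] becomes an
// extractelement, and the left-to-right xor chain can now be matched into a
// single VECREDUCE_XOR (lowered via vredxor.vs) instead of scalar code.
typedef short v4i16 __attribute__((vector_size(8)));

short xor_reduce_lanes(v4i16 v) {
  return v[0] ^ v[1] ^ v[2] ^ v[3];
}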
@llvm/pr-subscribers-backend-risc-v

Changes

This builds on the transform introduced in #67821, and generalizes it for all integer reduction types. A couple of notes (see the list in the description above).
Patch is 67.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68014.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dd10bca3598b7b6..da7aba66ef8b716 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -11108,6 +11108,31 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
+/// Given an integer binary operator, return the generic ISD::VECREDUCE_OP
+/// which corresponds to it.
+static unsigned getVecReduceOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled binary to transfrom reduction");
+ case ISD::ADD:
+ return ISD::VECREDUCE_ADD;
+ case ISD::UMAX:
+ return ISD::VECREDUCE_UMAX;
+ case ISD::SMAX:
+ return ISD::VECREDUCE_SMAX;
+ case ISD::UMIN:
+ return ISD::VECREDUCE_UMIN;
+ case ISD::SMIN:
+ return ISD::VECREDUCE_SMIN;
+ case ISD::AND:
+ return ISD::VECREDUCE_AND;
+ case ISD::OR:
+ return ISD::VECREDUCE_OR;
+ case ISD::XOR:
+ return ISD::VECREDUCE_XOR;
+ }
+};
+
/// Perform two related transforms whose purpose is to incrementally recognize
/// an explode_vector followed by scalar reduction as a vector reduction node.
/// This exists to recover from a deficiency in SLP which can't handle
@@ -11126,8 +11151,15 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
const SDLoc DL(N);
const EVT VT = N->getValueType(0);
- [[maybe_unused]] const unsigned Opc = N->getOpcode();
- assert(Opc == ISD::ADD && "extend this to other reduction types");
+
+ // TODO: Handle floating point here.
+ if (!VT.isInteger())
+ return SDValue();
+
+ const unsigned Opc = N->getOpcode();
+ const unsigned ReduceOpc = getVecReduceOpcode(Opc);
+ assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) &&
+ "Inconsistent mappings");
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
@@ -11157,13 +11189,13 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, 2);
SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
DAG.getVectorIdxConstant(0, DL));
- return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec);
+ return DAG.getNode(ReduceOpc, DL, VT, Vec);
}
// Match (binop (reduce (extract_subvector V, 0),
// (extract_vector_elt V, sizeof(SubVec))))
// into a reduction of one more element from the original vector V.
- if (LHS.getOpcode() != ISD::VECREDUCE_ADD)
+ if (LHS.getOpcode() != ReduceOpc)
return SDValue();
SDValue ReduceVec = LHS.getOperand(0);
@@ -11179,7 +11211,7 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, Idx + 1);
SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec,
DAG.getVectorIdxConstant(0, DL));
- return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec);
+ return DAG.getNode(ReduceOpc, DL, VT, Vec);
}
}
@@ -11687,6 +11719,8 @@ static SDValue performANDCombine(SDNode *N,
if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
return V;
+ if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+ return V;
if (DCI.isAfterLegalizeDAG())
if (SDValue V = combineDeMorganOfBoolean(N, DAG))
@@ -11739,6 +11773,8 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
return V;
+ if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+ return V;
if (DCI.isAfterLegalizeDAG())
if (SDValue V = combineDeMorganOfBoolean(N, DAG))
@@ -11790,6 +11826,9 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
return V;
+ if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+ return V;
+
// fold (xor (select cond, 0, y), x) ->
// (select cond, x, (xor x, y))
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
@@ -13995,8 +14034,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SMAX:
case ISD::SMIN:
case ISD::FMAXNUM:
- case ISD::FMINNUM:
- return combineBinOpToReduce(N, DAG, Subtarget);
+ case ISD::FMINNUM: {
+ if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget))
+ return V;
+ if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
+ return V;
+ return SDValue();
+ }
case ISD::SETCC:
return performSETCCCombine(N, DAG, Subtarget);
case ISD::SIGN_EXTEND_INREG:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index ab137b1ac818299..f3570495600f3c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -5,11 +5,10 @@
define i8 @explode_2xi8(<2 x i8> %v) {
; CHECK-LABEL: explode_2xi8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vredxor.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: xor a0, a0, a1
; CHECK-NEXT: ret
%e0 = extractelement <2 x i8> %v, i32 0
%e1 = extractelement <2 x i8> %v, i32 1
@@ -21,16 +20,16 @@ define i8 @explode_4xi8(<4 x i8> %v) {
; CHECK-LABEL: explode_4xi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a2, v9
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vmv.x.s a3, v8
-; CHECK-NEXT: xor a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a2, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: ret
%e0 = extractelement <4 x i8> %v, i32 0
%e1 = extractelement <4 x i8> %v, i32 1
@@ -47,28 +46,28 @@ define i8 @explode_8xi8(<8 x i8> %v) {
; CHECK-LABEL: explode_8xi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: vmv.x.s a0, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a3, v9
+; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vmv.x.s a2, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 5
-; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vmv.x.s a3, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vmv.x.s a6, v9
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vmv.x.s a7, v8
-; CHECK-NEXT: xor a0, a0, a1
+; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 7
+; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a6, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, a6, a0
; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: add a2, a2, a4
; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, a4, a6
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a0, a0, a7
+; CHECK-NEXT: add a0, a0, a5
; CHECK-NEXT: ret
%e0 = extractelement <8 x i8> %v, i32 0
%e1 = extractelement <8 x i8> %v, i32 1
@@ -89,119 +88,56 @@ define i8 @explode_8xi8(<8 x i8> %v) {
}
define i8 @explode_16xi8(<16 x i8> %v) {
-; RV32-LABEL: explode_16xi8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vslidedown.vi v9, v8, 1
-; RV32-NEXT: vmv.x.s a1, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: vmv.x.s a2, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 3
-; RV32-NEXT: vmv.x.s a3, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 4
-; RV32-NEXT: vmv.x.s a4, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 5
-; RV32-NEXT: vmv.x.s a5, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 6
-; RV32-NEXT: vmv.x.s a6, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 7
-; RV32-NEXT: vmv.x.s a7, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 8
-; RV32-NEXT: vmv.x.s t0, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 9
-; RV32-NEXT: vmv.x.s t1, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 10
-; RV32-NEXT: vmv.x.s t2, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 11
-; RV32-NEXT: vmv.x.s t3, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 12
-; RV32-NEXT: vmv.x.s t4, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 13
-; RV32-NEXT: vmv.x.s t5, v9
-; RV32-NEXT: vslidedown.vi v9, v8, 14
-; RV32-NEXT: vmv.x.s t6, v9
-; RV32-NEXT: vslidedown.vi v8, v8, 15
-; RV32-NEXT: vmv.x.s s0, v8
-; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: add a4, a4, a6
-; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: add a7, a7, t1
-; RV32-NEXT: add a7, a7, t2
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: add t3, t3, t5
-; RV32-NEXT: add t3, t3, t6
-; RV32-NEXT: add t3, t3, s0
-; RV32-NEXT: add a0, a0, t3
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_16xi8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vmv.x.s a1, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: vmv.x.s a2, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 3
-; RV64-NEXT: vmv.x.s a3, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 4
-; RV64-NEXT: vmv.x.s a4, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 5
-; RV64-NEXT: vmv.x.s a5, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 6
-; RV64-NEXT: vmv.x.s a6, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 7
-; RV64-NEXT: vmv.x.s a7, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 8
-; RV64-NEXT: vmv.x.s t0, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 9
-; RV64-NEXT: vmv.x.s t1, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 10
-; RV64-NEXT: vmv.x.s t2, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 11
-; RV64-NEXT: vmv.x.s t3, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 12
-; RV64-NEXT: vmv.x.s t4, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 13
-; RV64-NEXT: vmv.x.s t5, v9
-; RV64-NEXT: vslidedown.vi v9, v8, 14
-; RV64-NEXT: vmv.x.s t6, v9
-; RV64-NEXT: vslidedown.vi v8, v8, 15
-; RV64-NEXT: vmv.x.s s0, v8
-; RV64-NEXT: xor a0, a0, a1
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: add a4, a4, a6
-; RV64-NEXT: add a0, a0, a4
-; RV64-NEXT: add a7, a7, t0
-; RV64-NEXT: add a7, a7, t1
-; RV64-NEXT: add a7, a7, t2
-; RV64-NEXT: add a0, a0, a7
-; RV64-NEXT: add t3, t3, t4
-; RV64-NEXT: add t3, t3, t5
-; RV64-NEXT: add t3, t3, t6
-; RV64-NEXT: add t3, t3, s0
-; RV64-NEXT: add a0, a0, t3
-; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: ret
+; CHECK-LABEL: explode_16xi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 5
+; CHECK-NEXT: vmv.x.s a3, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 6
+; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 7
+; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 8
+; CHECK-NEXT: vmv.x.s a6, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 9
+; CHECK-NEXT: vmv.x.s a7, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 10
+; CHECK-NEXT: vmv.x.s t0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 11
+; CHECK-NEXT: vmv.x.s t1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 12
+; CHECK-NEXT: vmv.x.s t2, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 13
+; CHECK-NEXT: vmv.x.s t3, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 14
+; CHECK-NEXT: vmv.x.s t4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 15
+; CHECK-NEXT: vmv.x.s t5, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s t6, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, t6, a0
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a5, a5, a6
+; CHECK-NEXT: add a5, a5, a7
+; CHECK-NEXT: add a5, a5, t0
+; CHECK-NEXT: add a0, a0, a5
+; CHECK-NEXT: add t1, t1, t2
+; CHECK-NEXT: add t1, t1, t3
+; CHECK-NEXT: add t1, t1, t4
+; CHECK-NEXT: add t1, t1, t5
+; CHECK-NEXT: add a0, a0, t1
+; CHECK-NEXT: ret
%e0 = extractelement <16 x i8> %v, i32 0
%e1 = extractelement <16 x i8> %v, i32 1
%e2 = extractelement <16 x i8> %v, i32 2
@@ -239,11 +175,10 @@ define i8 @explode_16xi8(<16 x i8> %v) {
define i16 @explode_2xi16(<2 x i16> %v) {
; CHECK-LABEL: explode_2xi16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vredxor.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: xor a0, a0, a1
; CHECK-NEXT: ret
%e0 = extractelement <2 x i16> %v, i32 0
%e1 = extractelement <2 x i16> %v, i32 1
@@ -255,16 +190,16 @@ define i16 @explode_4xi16(<4 x i16> %v) {
; CHECK-LABEL: explode_4xi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a2, v9
-; CHECK-NEXT: vslidedown.vi v8, v8, 3
-; CHECK-NEXT: vmv.x.s a3, v8
-; CHECK-NEXT: xor a0, a0, a1
-; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a2, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, a2, a0
; CHECK-NEXT: ret
%e0 = extractelement <4 x i16> %v, i32 0
%e1 = extractelement <4 x i16> %v, i32 1
@@ -281,28 +216,28 @@ define i16 @explode_8xi16(<8 x i16> %v) {
; CHECK-LABEL: explode_8xi16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
-; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: vmv.x.s a0, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 3
-; CHECK-NEXT: vmv.x.s a3, v9
+; CHECK-NEXT: vmv.x.s a1, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 4
-; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vmv.x.s a2, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 5
-; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vmv.x.s a3, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 6
-; CHECK-NEXT: vmv.x.s a6, v9
-; CHECK-NEXT: vslidedown.vi v8, v8, 7
-; CHECK-NEXT: vmv.x.s a7, v8
-; CHECK-NEXT: xor a0, a0, a1
+; CHECK-NEXT: vmv.x.s a4, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 7
+; CHECK-NEXT: vmv.x.s a5, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vredxor.vs v8, v8, v9
+; CHECK-NEXT: vmv.x.s a6, v8
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: add a0, a6, a0
; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: add a2, a2, a4
; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: add a4, a4, a5
-; CHECK-NEXT: add a4, a4, a6
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a0, a0, a7
+; CHECK-NEXT: add a0, a0, a5
; CHECK-NEXT: ret
%e0 = extractelement <8 x i16> %v, i32 0
%e1 = extractelement <8 x i16> %v, i32 1
@@ -323,121 +258,57 @@ define i16 @explode_8xi16(<8 x i16> %v) {
}
define i16 @explode_16xi16(<16 x i16> %v) {
-; RV32-LABEL: explode_16xi16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vmv.x.s a1, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: vmv.x.s a2, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 3
-; RV32-NEXT: vmv.x.s a3, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 4
-; RV32-NEXT: vmv.x.s a4, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 5
-; RV32-NEXT: vmv.x.s a5, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 6
-; RV32-NEXT: vmv.x.s a6, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 7
-; RV32-NEXT: vmv.x.s a7, v10
-; RV32-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 8
-; RV32-NEXT: vmv.x.s t0, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 9
-; RV32-NEXT: vmv.x.s t1, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 10
-; RV32-NEXT: vmv.x.s t2, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 11
-; RV32-NEXT: vmv.x.s t3, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 12
-; RV32-NEXT: vmv.x.s t4, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 13
-; RV32-NEXT: vmv.x.s t5, v10
-; RV32-NEXT: vslidedown.vi v10, v8, 14
-; RV32-NEXT: vmv.x.s t6, v10
-; RV32-NEXT: vslidedown.vi v8, v8, 15
-; RV32-NEXT: vmv.x.s s0, v8
-; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a4, a4, a5
-; RV32-NEXT: add a4, a4, a6
-; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: add a7, a7, t0
-; RV32-NEXT: add a7, a7, t1
-; RV32-NEXT: add a7, a7, t2
-; RV32-NEXT: add a0, a0, a7
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: add t3, t3, t5
-; RV32-NEXT: add t3, t3, t6
-; RV32-NEXT: add t3, t3, s0
-; RV32-NEXT: add a0, a0, t3
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: explode_16xi16:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset s0, -8
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vmv.x.s a1, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vmv.x.s a2, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 3
-; RV64-NEXT: vmv.x.s a3, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 4
-; RV64-NEXT: vmv.x.s a4, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 5
-; RV64-NEXT: vmv.x.s a5, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 6
-; RV64-NEXT: vmv.x.s a6, v10
-; RV64-NEXT: vslidedown.vi v10, v8, 7
-; RV64-NEXT: vmv.x.s a7, v10
-; RV64-NEXT: vsetivli zero, 1, e16, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 8
-; RV64-NEXT: vmv.x.s t0, v10
-; RV6...
[truncated]
You can test this locally with the following command:
git-clang-format --diff 01797dad8686a1e7276ccd33f16934d31aa7c98a 07cddbc4a8c3f8052d80cca969cd71ed86e0933e -- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index da7aba66ef8b..1ca9eaac5b20 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3530,8 +3530,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
(NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
unsigned SignBits = DAG.ComputeNumSignBits(Op);
if (EltBitSize - SignBits < 8) {
- SDValue Source =
- DAG.getNode(ISD::TRUNCATE, DL, VT.changeVectorElementType(MVT::i8), Op);
+ SDValue Source = DAG.getNode(ISD::TRUNCATE, DL,
+ VT.changeVectorElementType(MVT::i8), Op);
Source = convertToScalableVector(ContainerVT.changeVectorElementType(MVT::i8),
Source, DAG, Subtarget);
SDValue Res = DAG.getNode(RISCVISD::VSEXT_VL, DL, ContainerVT, Source, Mask, VL);
LGTM
LGTM
  case ISD::XOR:
    return ISD::VECREDUCE_XOR;
  }
}; |
nit: do we intend to have this extra semicolon after the bracket?
Fixed in 63bbc25
…ctions (#68014)" This reverts commit 7a0b9da and 63bbc25. I'm seeing issues (e.g. on the GCC torture suite) where combineBinOpOfExtractToReduceTree is called when the V extensions aren't enabled and triggers a crash due to RISCVSubtarget::getElen asserting. I'll aim to follow up with a minimal reproducer. Although it's pretty obvious how to avoid this crash with some extra gating, there are a few options as to where that should be inserted so I think it's best to revert and agree the appropriate fix separately.
I reverted this in 824251c because I was seeing downstream failures due to the unguarded call to RISCVSubtarget::getElen. I've found you actually can reproduce the crash by just running the fixed-vectors-reduction-formation.ll test without the V extension enabled. Alternatively, here is a bugpoint-reduced example derived from 20050604-1.c in the GCC torture suite:

%union.anon.2.3.19.20 = type { <4 x i16> }
@u = external global %union.anon.2.3.19.20, align 8
declare void @foo()
define void @main() {
entry:
%u.promoted.i = load <4 x i16>, ptr @u, align 8
%add.1.i = add <4 x i16> %u.promoted.i, <i16 24, i16 0, i16 0, i16 0>
%0 = extractelement <4 x i16> %add.1.i, i64 2
%tobool = icmp ne i16 %0, 0
%or.cond = select i1 false, i1 true, i1 %tobool
%1 = extractelement <4 x i16> %add.1.i, i64 3
%tobool5 = icmp ne i16 %1, 0
%or.cond6 = select i1 %or.cond, i1 true, i1 %tobool5
br i1 %or.cond6, label %if.then, label %if.end
if.then: ; preds = %entry
unreachable
if.end: ; preds = %entry
ret void
}
declare void @abort()

One obvious fix would be to return immediately from combineBinOpOfExtractToReduceTree when the V extension isn't enabled.
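A minimal sketch of that kind of gate (assuming the usual RISCVSubtarget::hasVInstructions() query; the fix that actually landed in 199cbec may be structured differently):

// Bail out of the reduction-tree combine before any vector-specific
// subtarget queries (such as RISCVSubtarget::getELen) can be reached on a
// configuration without vector instructions.
static SDValue
combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG,
                                  const RISCVSubtarget &Subtarget) {
  if (!Subtarget.hasVInstructions())
    return SDValue();
  // ... existing matching logic continues unchanged ...
  return SDValue();
}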
const unsigned Opc = N->getOpcode();
const unsigned ReduceOpc = getVecReduceOpcode(Opc);
assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) && |
Maybe isOperationLegalOrCustomOrPromote(ReduceOpc, VT)?
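A hypothetical way that check could slot in (sketch only, not the committed code; TLI here would come from DAG.getTargetLoweringInfo()):

const unsigned Opc = N->getOpcode();
const unsigned ReduceOpc = getVecReduceOpcode(Opc);
assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) &&
       "Inconsistent mappings");
// Give up rather than form a reduction the target cannot lower for VT.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustomOrPromote(ReduceOpc, VT))
  return SDValue();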
…68014) (reapply)

This was reverted in 824251c due to an issue exposed by this change in a previous patch. Fixed in 199cbec. Original commit message follows.

This builds on the transform introduced in #67821, and generalizes it for all integer reduction types. A couple of notes:
* This will only form smax/smin/umax/umin reductions when zbb is enabled. Otherwise, we lower the min/max expressions early. I don't care about this case, and don't plan to address this further.
* This excludes floating point. Floating point introduces concerns about associativity. I may or may not do a follow up patch for that case.
* The explodevector test change is mildly undesirable from a clarity perspective. If anyone sees a good way to rewrite that to stabilize the test, please suggest.