diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..c39b9d55cc212 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18456,36 +18456,80 @@ static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
   return SDValue();
 }
 
-SDValue PPCTargetLowering::combineVectorSHL(SDNode *N,
-                                            DAGCombinerInfo &DCI) const {
+SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
   assert(VT.isVector() && "Vector type expected.");
 
-  SDValue N1 = N->getOperand(1);
-  if (!Subtarget.hasP8Altivec() || N1.getOpcode() != ISD::BUILD_VECTOR ||
-      !isOperationLegal(ISD::ADD, VT))
+  unsigned Opc = N->getOpcode();
+  assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
+         "Unexpected opcode.");
+
+  if (!isOperationLegal(Opc, VT))
     return SDValue();
 
-  // For 64-bit there is no splat immediate so we want to catch shift by 1 here
-  // before the BUILD_VECTOR is replaced by a load.
   EVT EltTy = VT.getScalarType();
-  if (EltTy != MVT::i64)
+  unsigned EltBits = EltTy.getSizeInBits();
+  if (EltTy != MVT::i64 && EltTy != MVT::i32)
     return SDValue();
 
-  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
-  APInt APSplatBits, APSplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  bool BVNIsConstantSplat =
-      BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
-                           HasAnyUndefs, 0, !Subtarget.isLittleEndian());
-  if (!BVNIsConstantSplat || SplatBitSize != EltTy.getSizeInBits())
+  SDValue N1 = N->getOperand(1);
+  uint64_t SplatBits = 0;
+  bool AddSplatCase = false;
+  unsigned OpcN1 = N1.getOpcode();
+  if (OpcN1 == PPCISD::VADD_SPLAT &&
+      N1.getConstantOperandVal(1) == VT.getVectorNumElements()) {
+    AddSplatCase = true;
+    SplatBits = N1.getConstantOperandVal(0);
+  }
+
+  if (!AddSplatCase) {
+    if (OpcN1 != ISD::BUILD_VECTOR)
+      return SDValue();
+
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    APInt APSplatBits, APSplatUndef;
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
+    bool BVNIsConstantSplat =
+        BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+    if (!BVNIsConstantSplat || SplatBitSize != EltBits)
+      return SDValue();
+    SplatBits = APSplatBits.getZExtValue();
+  }
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  // PPC vector shifts by word/double look at only the low 5/6 bits of the
+  // shift vector, which means the max value is 31/63. A shift vector of all
+  // 1s will be truncated to 31/63, which is useful as vspltiw is limited to
+  // -16 to 15 range.
+  if (SplatBits == (EltBits - 1)) {
+    unsigned NewOpc;
+    switch (Opc) {
+    case ISD::SHL:
+      NewOpc = PPCISD::SHL;
+      break;
+    case ISD::SRL:
+      NewOpc = PPCISD::SRL;
+      break;
+    case ISD::SRA:
+      NewOpc = PPCISD::SRA;
+      break;
+    }
+    SDValue SplatOnes = getCanonicalConstSplat(255, 1, VT, DCI.DAG, DL);
+    return DCI.DAG.getNode(NewOpc, DL, VT, N0, SplatOnes);
+  }
+
+  if (Opc != ISD::SHL || !isOperationLegal(ISD::ADD, VT))
    return SDValue();
-  uint64_t SplatBits = APSplatBits.getZExtValue();
-  if (SplatBits != 1)
+
+  // For 64-bit there is no splat immediate so we want to catch shift by 1 here
+  // before the BUILD_VECTOR is replaced by a load.
+  if (EltTy != MVT::i64 || SplatBits != 1)
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
   return DCI.DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
 }
 
@@ -18494,7 +18538,7 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
     return Value;
 
   if (N->getValueType(0).isVector())
-    return combineVectorSHL(N, DCI);
+    return combineVectorShift(N, DCI);
 
   SDValue N0 = N->getOperand(0);
   ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -18526,6 +18570,9 @@ SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
     return Value;
 
+  if (N->getValueType(0).isVector())
+    return combineVectorShift(N, DCI);
+
   return SDValue();
 }
 
@@ -18533,6 +18580,9 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
     return Value;
 
+  if (N->getValueType(0).isVector())
+    return combineVectorShift(N, DCI);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..2c55b5427297a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1441,7 +1441,7 @@ namespace llvm {
     SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
-    SDValue combineVectorSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineVectorShift(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
index e2ddef8b49758..e3d231adf734f 100644
--- a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
@@ -252,23 +252,19 @@ define <4 x i32> @test7_v4i32(<4 x i32> %a) {
         ret <4 x i32> %tmp.1
 }
 ; CHECK-LABEL: test7_v4i32:
-; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
-; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
-; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
+; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
 ; CHECK-NOT: vmul
-; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
+; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
 
 define <4 x i32> @test8_v4i32(<4 x i32> %a) {
         %tmp.1 = mul nsw <4 x i32> %a,  ; <<4 x i32>> [#uses=1]
         ret <4 x i32> %tmp.1
 }
 ; CHECK-LABEL: test8_v4i32:
-; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
-; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
-; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
+; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
 ; CHECK-NOT: vmul
-; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
-; CHECK-NEXT: vsubuwm v[[REG6:[0-9]+]], v[[REG5]], v2
+; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
+; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v2
 
 define <2 x i64> @test1_v2i64(<2 x i64> %a) {
         %tmp.1 = mul nsw <2 x i64> %a,  ; <<2 x i64>> [#uses=1]
         ret <2 x i64> %tmp.1
@@ -356,8 +352,7 @@ define <2 x i64> @test7_v2i64(<2 x i64> %a) {
 }
 
 ; CHECK-LABEL: test7_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
 ; CHECK-NOT: vmul
 ; CHECK-NEXT: vsld v[[REG4:[0-9]+]], v2, v[[REG2]]
 
@@ -367,8 +362,7 @@ define <2 x i64> @test8_v2i64(<2 x i64> %a) {
 }
 
 ; CHECK-LABEL: test8_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
 ; CHECK-NOT: vmul
 ; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
 ; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2
diff --git a/llvm/test/CodeGen/PowerPC/pr47891.ll b/llvm/test/CodeGen/PowerPC/pr47891.ll
index 46ff074fae647..6438302d574e6 100644
--- a/llvm/test/CodeGen/PowerPC/pr47891.ll
+++ b/llvm/test/CodeGen/PowerPC/pr47891.ll
@@ -7,13 +7,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: poly2_lshift1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r6, r2, .LCPI0_0@toc@ha
+; CHECK-NEXT:    ld r6, 0(r3)
 ; CHECK-NEXT:    li r4, 72
 ; CHECK-NEXT:    ld r5, 64(r3)
-; CHECK-NEXT:    addi r6, r6, .LCPI0_0@toc@l
+; CHECK-NEXT:    xxleqv v4, v4, v4
 ; CHECK-NEXT:    lxvd2x vs0, r3, r4
-; CHECK-NEXT:    lxvd2x v4, 0, r6
-; CHECK-NEXT:    ld r6, 0(r3)
 ; CHECK-NEXT:    sldi r7, r6, 1
 ; CHECK-NEXT:    rotldi r6, r6, 1
 ; CHECK-NEXT:    std r7, 0(r3)
@@ -35,11 +33,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
 ; CHECK-NEXT:    std r7, 32(r3)
 ; CHECK-NEXT:    ld r7, 40(r3)
 ; CHECK-NEXT:    rldimi r6, r7, 1, 0
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    mtfprd f0, r5
 ; CHECK-NEXT:    rotldi r7, r7, 1
 ; CHECK-NEXT:    std r6, 40(r3)
 ; CHECK-NEXT:    ld r6, 48(r3)
+; CHECK-NEXT:    xxswapd v2, vs0
+; CHECK-NEXT:    mtfprd f0, r5
 ; CHECK-NEXT:    rldimi r7, r6, 1, 0
 ; CHECK-NEXT:    rotldi r6, r6, 1
 ; CHECK-NEXT:    std r7, 48(r3)
diff --git a/llvm/test/CodeGen/PowerPC/signbit-shift.ll b/llvm/test/CodeGen/PowerPC/signbit-shift.ll
index e8cedd47d812d..f8838b50816e1 100644
--- a/llvm/test/CodeGen/PowerPC/signbit-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/signbit-shift.ll
@@ -188,12 +188,10 @@ define i32 @add_lshr_not(i32 %x) {
 define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
 ; CHECK-LABEL: add_lshr_not_vec_splat:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
 ; CHECK-NEXT:    addis 3, 2, .LCPI15_0@toc@ha
-; CHECK-NEXT:    vsubuwm 3, 4, 3
-; CHECK-NEXT:    addi 3, 3, .LCPI15_0@toc@l
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vsraw 2, 2, 3
+; CHECK-NEXT:    addi 3, 3, .LCPI15_0@toc@l
 ; CHECK-NEXT:    lxvd2x 35, 0, 3
 ; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
@@ -218,12 +216,10 @@ define i32 @sub_lshr_not(i32 %x) {
 define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
 ; CHECK-LABEL: sub_lshr_not_vec_splat:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
 ; CHECK-NEXT:    addis 3, 2, .LCPI17_0@toc@ha
-; CHECK-NEXT:    vsubuwm 3, 4, 3
-; CHECK-NEXT:    addi 3, 3, .LCPI17_0@toc@l
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vsrw 2, 2, 3
+; CHECK-NEXT:    addi 3, 3, .LCPI17_0@toc@l
 ; CHECK-NEXT:    lxvd2x 35, 0, 3
 ; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
@@ -247,9 +243,7 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
 define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: sub_lshr_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 4, -16
-; CHECK-NEXT:    vspltisw 5, 15
-; CHECK-NEXT:    vsubuwm 4, 5, 4
+; CHECK-NEXT:    xxleqv 36, 36, 36
 ; CHECK-NEXT:    vsraw 2, 2, 4
 ; CHECK-NEXT:    vadduwm 2, 3, 2
 ; CHECK-NEXT:    blr
@@ -272,12 +266,10 @@ define i32 @sub_const_op_lshr(i32 %x) {
 define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
 ; CHECK-LABEL: sub_const_op_lshr_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
 ; CHECK-NEXT:    addis 3, 2, .LCPI21_0@toc@ha
-; CHECK-NEXT:    vsubuwm 3, 4, 3
-; CHECK-NEXT:    addi 3, 3, .LCPI21_0@toc@l
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vsraw 2, 2, 3
+; CHECK-NEXT:    addi 3, 3, .LCPI21_0@toc@l
 ; CHECK-NEXT:    lxvd2x 35, 0, 3
 ; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/vselect-constants.ll b/llvm/test/CodeGen/PowerPC/vselect-constants.ll
index b72142943dd8b..e65d28188a88f 100644
--- a/llvm/test/CodeGen/PowerPC/vselect-constants.ll
+++ b/llvm/test/CodeGen/PowerPC/vselect-constants.ll
@@ -11,19 +11,17 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_C1_or_C2_vec:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
+; CHECK-NEXT:    xxleqv 37, 37, 37
+; CHECK-NEXT:    vslw 2, 2, 5
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_0@toc@l
-; CHECK-NEXT:    vsubuwm 3, 4, 3
+; CHECK-NEXT:    vsraw 2, 2, 5
 ; CHECK-NEXT:    lxvd2x 0, 0, 3
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
-; CHECK-NEXT:    vslw 2, 2, 3
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_1@toc@l
-; CHECK-NEXT:    vsraw 2, 2, 3
-; CHECK-NEXT:    xxswapd 37, 0
+; CHECK-NEXT:    xxswapd 35, 0
 ; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    xxswapd 32, 0
-; CHECK-NEXT:    xxsel 34, 32, 37, 34
+; CHECK-NEXT:    xxswapd 36, 0
+; CHECK-NEXT:    xxsel 34, 36, 35, 34
 ; CHECK-NEXT:    blr
   %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
   ret <4 x i32> %add
@@ -82,15 +80,13 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_Cminus1_or_C_vec:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
+; CHECK-NEXT:    xxleqv 36, 36, 36
+; CHECK-NEXT:    vslw 2, 2, 4
 ; CHECK-NEXT:    addi 3, 3, .LCPI4_0@toc@l
-; CHECK-NEXT:    vsubuwm 3, 4, 3
+; CHECK-NEXT:    vsraw 2, 2, 4
 ; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    vslw 2, 2, 3
-; CHECK-NEXT:    vsraw 2, 2, 3
-; CHECK-NEXT:    xxswapd 37, 0
-; CHECK-NEXT:    vadduwm 2, 2, 5
+; CHECK-NEXT:    xxswapd 35, 0
+; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
   %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
   ret <4 x i32> %add
@@ -114,9 +110,7 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_minus1_or_0_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
-; CHECK-NEXT:    vsubuwm 3, 4, 3
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vslw 2, 2, 3
 ; CHECK-NEXT:    vsraw 2, 2, 3
 ; CHECK-NEXT:    blr
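
For illustration only, not part of the patch: a minimal standalone IR reproducer (hypothetical function name) for the pattern the new combine targets. A vector shift whose amount is a splat of element-bits-minus-one (31 for <4 x i32>, 63 for <2 x i64>) is expected to materialize the shift amount with xxleqv (vector all-ones) instead of a vspltisw/vsubuwm pair or a constant-pool load, per the updated CHECK lines above.

; Hypothetical reproducer, not taken from the patch. Per the updated tests,
; the expected lowering on a VSX target is xxleqv to build the all-ones
; shift amount, followed by vsrw.
define <4 x i32> @srl_by_31(<4 x i32> %x) {
  %r = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  ret <4 x i32> %r
}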