diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b703eb90e8ef3..affc29ec18ff7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -758,9 +758,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Custom); setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction( - {ISD::SELECT_CC, ISD::VSELECT, ISD::VP_MERGE, ISD::VP_SELECT}, VT, - Expand); + setOperationAction({ISD::SELECT_CC, ISD::VSELECT, ISD::VP_SELECT}, VT, + Expand); + setOperationAction(ISD::VP_MERGE, VT, Custom); setOperationAction({ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}, VT, Custom); @@ -1237,6 +1237,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SETCC, ISD::VP_TRUNCATE}, VT, Custom); + setOperationAction(ISD::VP_MERGE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); continue; @@ -7492,8 +7494,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSET_ROUNDING(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); - case ISD::VP_SELECT: case ISD::VP_MERGE: + if (Op.getSimpleValueType().getVectorElementType() == MVT::i1) + return lowerVPMergeMask(Op, DAG); + [[fallthrough]]; + case ISD::VP_SELECT: case ISD::VP_ADD: case ISD::VP_SUB: case ISD::VP_MUL: @@ -12078,6 +12083,65 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op, return convertFromScalableVector(VT, Result, DAG, Subtarget); } +SDValue RISCVTargetLowering::lowerVPMergeMask(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + MVT XLenVT = Subtarget.getXLenVT(); + + SDValue Mask = Op.getOperand(0); + SDValue TrueVal = Op.getOperand(1); + SDValue FalseVal = Op.getOperand(2); + SDValue VL = Op.getOperand(3); + + // Use default legalization if a vector of EVL type would be legal. + EVT EVLVecVT = EVT::getVectorVT(*DAG.getContext(), VL.getValueType(), + VT.getVectorElementCount()); + if (isTypeLegal(EVLVecVT)) + return SDValue(); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + Mask = convertToScalableVector(ContainerVT, Mask, DAG, Subtarget); + TrueVal = convertToScalableVector(ContainerVT, TrueVal, DAG, Subtarget); + FalseVal = convertToScalableVector(ContainerVT, FalseVal, DAG, Subtarget); + } + + // Promote to a vector of i8. + MVT PromotedVT = ContainerVT.changeVectorElementType(MVT::i8); + + // Promote TrueVal and FalseVal using VLMax. + // FIXME: Is there a better way to do this? + SDValue VLMax = DAG.getRegister(RISCV::X0, XLenVT); + SDValue SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, PromotedVT, + DAG.getUNDEF(PromotedVT), + DAG.getConstant(1, DL, XLenVT), VLMax); + SDValue SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, PromotedVT, + DAG.getUNDEF(PromotedVT), + DAG.getConstant(0, DL, XLenVT), VLMax); + TrueVal = DAG.getNode(RISCVISD::VMERGE_VL, DL, PromotedVT, TrueVal, SplatOne, + SplatZero, DAG.getUNDEF(PromotedVT), VL); + // Any element past VL uses FalseVal, so use VLMax + FalseVal = DAG.getNode(RISCVISD::VMERGE_VL, DL, PromotedVT, FalseVal, + SplatOne, SplatZero, DAG.getUNDEF(PromotedVT), VLMax); + + // VP_MERGE the two promoted values. + SDValue VPMerge = DAG.getNode(RISCVISD::VMERGE_VL, DL, PromotedVT, Mask, + TrueVal, FalseVal, FalseVal, VL); + + // Convert back to mask. 
+ SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL); + SDValue Result = DAG.getNode( + RISCVISD::SETCC_VL, DL, ContainerVT, + {VPMerge, DAG.getConstant(0, DL, PromotedVT), DAG.getCondCode(ISD::SETNE), + DAG.getUNDEF(getMaskTypeFor(ContainerVT)), TrueMask, VLMax}); + + if (VT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + return Result; +} + SDValue RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 0944bb8793a94..4c78fd784a3c8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -996,6 +996,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPMergeMask(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSplatExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index a53d33e6120d5..6394542479d1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -58,6 +58,182 @@ define <4 x i1> @vpmerge_vv_v4i1(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %m, i32 ze ret <4 x i1> %v } +define <8 x i1> @vpmerge_vv_v8i1(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_vv_v8i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vmsltu.vx v12, v10, a0 +; RV32-NEXT: vmand.mm v9, v9, v12 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vv_v8i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vid.v v12 +; RV64-NEXT: vmsltu.vx v10, v12, a0 +; RV64-NEXT: vmand.mm v9, v9, v10 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vpmerge_vv_v8i1: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32ZVFHMIN-NEXT: vid.v v10 +; RV32ZVFHMIN-NEXT: vmsltu.vx v12, v10, a0 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v12 +; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vpmerge_vv_v8i1: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vid.v v12 +; RV64ZVFHMIN-NEXT: vmsltu.vx v10, v12, a0 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: ret + %v = call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> %m, <8 x i1> %va, <8 x i1> %vb, i32 %evl) + ret <8 x i1> %v +} + +define <16 x i1> @vpmerge_vv_v16i1(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_vv_v16i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: vmsltu.vx v10, v12, 
a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vv_v16i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vid.v v16 +; RV64-NEXT: vmsltu.vx v10, v16, a0 +; RV64-NEXT: vmand.mm v9, v9, v10 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vpmerge_vv_v16i1: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32ZVFHMIN-NEXT: vid.v v12 +; RV32ZVFHMIN-NEXT: vmsltu.vx v10, v12, a0 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vpmerge_vv_v16i1: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vid.v v16 +; RV64ZVFHMIN-NEXT: vmsltu.vx v10, v16, a0 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: ret + %v = call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> %m, <16 x i1> %va, <16 x i1> %vb, i32 %evl) + ret <16 x i1> %v +} + +define <32 x i1> @vpmerge_vv_v32i1(<32 x i1> %va, <32 x i1> %vb, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_vv_v32i1: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vid.v v16 +; RV32-NEXT: vmsltu.vx v10, v16, a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vv_v32i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vmerge.vim v12, v10, 1, v0 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; RV64-NEXT: vmerge.vvm v10, v10, v12, v0 +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vpmerge_vv_v32i1: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a1, 32 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32ZVFHMIN-NEXT: vid.v v16 +; RV32ZVFHMIN-NEXT: vmsltu.vx v10, v16, a0 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vpmerge_vv_v32i1: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv.v.i v10, 0 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmerge.vim v12, v10, 1, v0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v8 +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v9 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; RV64ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmsne.vi v0, v10, 0 +; RV64ZVFHMIN-NEXT: ret + %v = call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> %m, <32 x i1> %va, <32 x i1> %vb, i32 %evl) + ret <32 x 
i1> %v
+}
+
+define <64 x i1> @vpmerge_vv_v64i1(<64 x i1> %va, <64 x i1> %vb, <64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpmerge_vv_v64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
+; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: ret
+ %v = call <64 x i1> @llvm.vp.merge.v64i1(<64 x i1> %m, <64 x i1> %va, <64 x i1> %vb, i32 %evl)
+ ret <64 x i1> %v
+}
+
 declare <2 x i8> @llvm.vp.merge.v2i8(<2 x i1>, <2 x i8>, <2 x i8>, i32)
 
 define <2 x i8> @vpmerge_vv_v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 zeroext %evl) {
@@ -1188,10 +1364,10 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
 ; CHECK-NEXT: vle64.v v8, (a0)
 ; CHECK-NEXT: li a1, 16
 ; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB79_2
+; CHECK-NEXT: bltu a2, a1, .LBB83_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: .LBB79_2:
+; CHECK-NEXT: .LBB83_2:
 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma
 ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT: addi a0, a2, -16
@@ -1221,10 +1397,10 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1>
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: li a2, 16
 ; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: bltu a0, a2, .LBB80_2
+; CHECK-NEXT: bltu a0, a2, .LBB84_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: .LBB80_2:
+; CHECK-NEXT: .LBB84_2:
 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
 ; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0
 ; CHECK-NEXT: addi a1, a0, -16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index 88a8ebcc90054..4cd77185e6930 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -35,6 +35,205 @@ define <vscale x 1 x i1> @vpmerge_nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i
   ret <vscale x 1 x i1> %v
 }
 
+define <vscale x 2 x i1> @vpmerge_nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpmerge_nxv2i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: vmsltu.vx v10, v10, a0
+; RV32-NEXT: vmand.mm v9, v9, v10
+; RV32-NEXT: vmandn.mm v8, v8, v9
+; RV32-NEXT: vmand.mm v9, v0, v9
+; RV32-NEXT: vmor.mm v0, v9, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpmerge_nxv2i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64-NEXT: vid.v v10
+; RV64-NEXT: vmsltu.vx v12, v10, a0
+; RV64-NEXT: vmand.mm v9, v9, v12
+; RV64-NEXT: vmandn.mm v8, v8, v9
+; RV64-NEXT: vmand.mm v9, v0, v9
+; RV64-NEXT: vmor.mm v0, v9, v8
+; RV64-NEXT: ret
+ %v = call <vscale x 2 x i1> @llvm.vp.merge.nxv2i1(<vscale x 2 x i1> %m, <vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 %evl)
+ ret <vscale x 2 x i1> %v
+}
+
+define <vscale x 4 x i1> @vpmerge_nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpmerge_nxv4i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: vmsltu.vx v12, v10, a0
+; RV32-NEXT: vmand.mm v9, v9, v12
+; RV32-NEXT: vmandn.mm v8, v8, v9
+; RV32-NEXT: vmand.mm v9, v0, v9
+; RV32-NEXT: vmor.mm v0, v9, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpmerge_nxv4i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64-NEXT: vid.v v12
+; RV64-NEXT: vmsltu.vx v10, v12, a0
+; RV64-NEXT: vmand.mm v9, v9, v10
+; RV64-NEXT: vmandn.mm v8, v8, v9
+; RV64-NEXT: vmand.mm v9, v0, v9
+; RV64-NEXT: vmor.mm v0, v9, v8
+; RV64-NEXT: ret
+ %v = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> %m, <vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 %evl)
+ ret <vscale x 4 x i1> %v
+}
+
+define <vscale x 8 x i1> @vpmerge_nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpmerge_nxv8i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT: vid.v v12
+; RV32-NEXT: vmsltu.vx v10, v12, a0
+; RV32-NEXT: vmand.mm v9, v9, v10
+; RV32-NEXT: vmandn.mm v8, v8, v9
+; RV32-NEXT: vmand.mm v9, v0, v9
+; RV32-NEXT: vmor.mm v0, v9, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpmerge_nxv8i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: vmsltu.vx v10, v16, a0
+; RV64-NEXT: vmand.mm v9, v9, v10
+; RV64-NEXT: vmandn.mm v8, v8, v9
+; RV64-NEXT: vmand.mm v9, v0, v9
+; RV64-NEXT: vmor.mm v0, v9, v8
+; RV64-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.vp.merge.nxv8i1(<vscale x 8 x i1> %m, <vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 %evl)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 16 x i1> @vpmerge_nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vpmerge_nxv16i1:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vmsltu.vx v10, v16, a0
+; RV32-NEXT: vmand.mm v9, v9, v10
+; RV32-NEXT: vmandn.mm v8, v8, v9
+; RV32-NEXT: vmand.mm v9, v0, v9
+; RV32-NEXT: vmor.mm v0, v9, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpmerge_nxv16i1:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; RV64-NEXT: vmerge.vim v12, v10, 1, v0
+; RV64-NEXT: vmv1r.v v0, v8
+; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; RV64-NEXT: vmerge.vim v10, v10, 1, v0
+; RV64-NEXT: vmv1r.v v0, v9
+; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, ma
+; RV64-NEXT: vmerge.vvm v10, v10, v12, v0
+; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; RV64-NEXT: vmsne.vi v0, v10, 0
+; RV64-NEXT: ret
+ %v = call <vscale x 16 x i1> @llvm.vp.merge.nxv16i1(<vscale x 16 x i1> %m, <vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 %evl)
+ ret <vscale x 16 x i1> %v
+}
+
+define <vscale x 32 x i1> @vpmerge_nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpmerge_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
+; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i1> @llvm.vp.merge.nxv32i1(<vscale x 32 x i1> %m, <vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 %evl)
+ ret <vscale x 32 x i1> %v
+}
+
+define <vscale x 64 x i1> @vpmerge_nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpmerge_nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v16, 0
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i1> @llvm.vp.merge.nxv64i1(<vscale x 64 x i1> %m, <vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 %evl)
+ ret <vscale x 64 x i1> %v
+}
+
+define <vscale x 128 x i1> @vpmerge_nxv128i1(<vscale x 128 x i1> %va, <vscale x 128 x i1> %vb, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpmerge_nxv128i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v7, v12
+; CHECK-NEXT: vmv1r.v v4, v11
+; CHECK-NEXT: vmv1r.v v6, v10
+; CHECK-NEXT: vmv1r.v v3, v9
+; CHECK-NEXT: vmv1r.v v5, v8
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: sub a2, a0, a2
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v24, v16, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v3
+; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT: sltu a0, a0, a2
+; CHECK-NEXT: vmv1r.v v0, v4
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmsne.vi v9, v16, 0
+; CHECK-NEXT: and a0, a0, a2
+; CHECK-NEXT: vmv1r.v v0, v5
+; CHECK-NEXT: vmv.v.i v24, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v16, v24, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v6
+; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmerge.vim v24, v24, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma
+; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmsne.vi v8, v24, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: ret
+ %v = call <vscale x 128 x i1> @llvm.vp.merge.nxv128i1(<vscale x 128 x i1> %m, <vscale x 128 x i1> %va, <vscale x 128 x i1> %vb, i32 %evl)
+ ret <vscale x 128 x i1> %v
+}
+
 declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
 
 define <vscale x 1 x i8> @vpmerge_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -378,10 +577,10 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, @vpmerge_vx_nxv128i8(i8 %a, <vscale x 128 x i8> %vb,
 ; CHECK-NEXT: and a3, a4, a3
 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma
 ; CHECK-NEXT: vmerge.vxm v16, v16, a0, v0
-; CHECK-NEXT: bltu a2, a1, .LBB29_2
+; CHECK-NEXT: bltu a2, a1, .LBB36_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB29_2:
+; CHECK-NEXT: .LBB36_2:
 ; CHECK-NEXT: vmv1r.v v0, v24
 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma
 ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
@@ -440,10 +639,10 @@ define <vscale x 128 x i8> @vpmerge_vi_nxv128i8(<vscale x 128 x i8> %vb,