From bc5bdeac46b8295f3a43a033dafa6061c6e82814 Mon Sep 17 00:00:00 2001
From: Michael Maitland
Date: Tue, 10 Oct 2023 10:35:01 -0700
Subject: [PATCH 1/4] [RISCV][CostModel] Recommit VPIntrinsics have same cost
 as their non-vp counterparts

This was reverted in commit 0abaf3caee88ae74def2c7000aff8e61b24634bb
(#67178). This version of the patch includes a fix for a failure caused
by vp-reductions having an extra start value argument that their non-vp
counterparts do not have.
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  56 +++
 llvm/test/Analysis/CostModel/RISCV/gep.ll     |   8 +-
 .../CostModel/RISCV/rvv-intrinsics.ll         | 370 +++++++++++++++++-
 3 files changed, 429 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 3dd16dafe3c42..2d0d10982aff6 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1691,6 +1691,62 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     }
   }
 
+  // VP Intrinsics should have the same cost as their non-vp counterpart.
+  // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
+  // counterpart when the vector length argument is smaller than the maximum
+  // vector length.
+  if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
+    std::optional<unsigned> FOp =
+        VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
+    if (FOp) {
+      // TODO: Support other kinds of Intrinsics (i.e. reductions)
+      if (ICA.getID() == Intrinsic::vp_load) {
+        Align Alignment;
+        if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+          Alignment = VPI->getPointerAlignment().valueOrOne();
+        unsigned AS = 0;
+        if (ICA.getArgs().size() > 1)
+          if (auto *PtrTy =
+                  dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
+            AS = PtrTy->getAddressSpace();
+        return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
+                                        AS, CostKind);
+      }
+      if (ICA.getID() == Intrinsic::vp_store) {
+        Align Alignment;
+        if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
+          Alignment = VPI->getPointerAlignment().valueOrOne();
+        unsigned AS = 0;
+        if (ICA.getArgs().size() >= 2)
+          if (auto *PtrTy =
+                  dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
+            AS = PtrTy->getAddressSpace();
+        return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
+                                        AS, CostKind);
+      }
+      if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
+        return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
+                                               CostKind);
+      }
+    }
+
+    std::optional<Intrinsic::ID> FID =
+        VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
+    if (FID) {
+      // Non-vp version will have same Args/Tys except mask and vector length.
+      assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
+             "Expected VPIntrinsic to have Mask and Vector Length args and "
+             "types");
+      ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs()).drop_back(2);
+      ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
+
+      IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
+                                     NewTys, ICA.getFlags(), ICA.getInst(),
+                                     ICA.getScalarizationCost());
+      return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
+    }
+  }
+
   // Assume that we need to scalarize this intrinsic.
   // Compute the scalarization overhead based on Args for a vector
   // intrinsic.
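The gep.ll and rvv-intrinsics.ll updates below exercise this path. As a
quick way to inspect the resulting costs locally, a minimal sketch
(standalone file; the RUN line mirrors the flags used by the RISC-V
cost-model tests, and the function name is illustrative only):

; RUN: opt -passes="print<cost-model>" -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1

define void @cost_vp_add(<2 x i8> %a, <2 x i8> %b, <2 x i1> %m, i32 %evl) {
  ; With this patch the vp.add call below is costed like the plain add
  ; (cost 1), even when %evl is smaller than the full vector length (see
  ; the TODO in the hunk above).
  %vp = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> %m, i32 %evl)
  %plain = add <2 x i8> %a, %b
  ret void
}
declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)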
diff --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll index be518faf7e051..4fadf34c1973f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/gep.ll +++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll @@ -270,7 +270,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42 -; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = getelementptr i8, ptr %base, i32 42 @@ -282,7 +282,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42 -; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -340,7 +340,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0 -; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x6 = call <2 x i8> 
@llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %6, i64 undef, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %7 = getelementptr i8, ptr %base, i32 0 @@ -352,7 +352,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0 -; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr %12, i64 undef, <2 x i1> undef, i32 undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll index 93de623cf1c6d..85364c935267d 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll @@ -206,10 +206,378 @@ define void @vp_fshl() { ret void } +define void @add() { +; CHECK-LABEL: 'add' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = add <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = add <4 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = add <8 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = add <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = add <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = add <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> 
undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = add <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = add <16 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call @llvm.vp.add.nxv2i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call @llvm.vp.add.nxv4i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call @llvm.vp.add.nxv8i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call @llvm.vp.add.nxv16i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call @llvm.vp.add.nxv2i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call @llvm.vp.add.nxv4i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call @llvm.vp.add.nxv8i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call @llvm.vp.add.nxv16i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = add undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %t0 = call <2 x i8> @llvm.vp.add.v2i8(<2 x i8> undef, <2 x i8> undef, <2 x i1> undef, i32 undef) + %t1 = add <2 x i8> undef, undef + %t2 = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> undef, <4 x i8> undef, <4 x i1> undef, i32 undef) + %t3 = add <4 x i8> undef, undef + %t4 = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> undef, <8 x i8> undef, <8 x i1> undef, i32 undef) + %t5 = add <8 x i8> undef, undef + %t6 = call <16 x i8> @llvm.vp.add.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef, i32 undef) + %t7 = add <16 x i8> undef, undef + %t8 = call <2 x i64> @llvm.vp.add.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef, i32 undef) + %t9 = add <2 x i64> undef, undef + %t10 = call <4 x i64> @llvm.vp.add.v4i64(<4 x i64> undef, <4 x i64> undef, <4 x i1> undef, i32 undef) + %t12 = add <4 x i64> undef, undef + %t13 = call <8 x i64> @llvm.vp.add.v8i64(<8 x i64> undef, <8 x i64> undef, <8 x i1> undef, i32 undef) + %t14 = add <8 x i64> undef, undef + %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef) + %t16 = add <16 x i64> undef, undef + %t17 = call @llvm.vp.add.nv2i8( undef, undef, 
undef, i32 undef) + %t18 = add undef, undef + %t19 = call @llvm.vp.add.nv4i8( undef, undef, undef, i32 undef) + %t20 = add undef, undef + %t21 = call @llvm.vp.add.nv8i8( undef, undef, undef, i32 undef) + %t22 = add undef, undef + %t23 = call @llvm.vp.add.nv16i8( undef, undef, undef, i32 undef) + %t24 = add undef, undef + %t25 = call @llvm.vp.add.nv2i64( undef, undef, undef, i32 undef) + %t26 = add undef, undef + %t27 = call @llvm.vp.add.nv4i64( undef, undef, undef, i32 undef) + %t28 = add undef, undef + %t29 = call @llvm.vp.add.nv8i64( undef, undef, undef, i32 undef) + %t30 = add undef, undef + %t31 = call @llvm.vp.add.nv16i64( undef, undef, undef, i32 undef) + %t32 = add undef, undef + ret void +} + +define void @abs() { +; CHECK-LABEL: 'abs' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 false, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.vp.abs.nxv2i8( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call @llvm.abs.nxv2i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.vp.abs.nxv4i8( undef, i1 false, undef, 
i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call @llvm.abs.nxv4i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = call @llvm.vp.abs.nxv8i8( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call @llvm.abs.nxv8i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.vp.abs.nxv16i8( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call @llvm.abs.nxv16i8( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call @llvm.vp.abs.nxv2i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.abs.nxv2i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call @llvm.vp.abs.nxv4i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.abs.nxv4i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call @llvm.vp.abs.nxv8i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call @llvm.abs.nxv8i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call @llvm.vp.abs.nxv16i64( undef, i1 false, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = call @llvm.abs.nxv16i64( undef, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> undef, i1 0, <2 x i1> undef, i32 undef) + call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 0) + call <4 x i8> @llvm.vp.abs.v4i8(<4 x i8> undef, i1 0, <4 x i1> undef, i32 undef) + call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 0) + call <8 x i8> @llvm.vp.abs.v8i8(<8 x i8> undef, i1 0, <8 x i1> undef, i32 undef) + call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 0) + call <16 x i8> @llvm.vp.abs.v16i8(<16 x i8> undef, i1 0, <16 x i1> undef, i32 undef) + call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 0) + call <2 x i64> @llvm.vp.abs.v2i64(<2 x i64> undef, i1 0, <2 x i1> undef, i32 undef) + call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 0) + call <4 x i64> @llvm.vp.abs.v4i64(<4 x i64> undef, i1 0, <4 x i1> undef, i32 undef) + call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 0) + call <8 x i64> @llvm.vp.abs.v8i64(<8 x i64> undef, i1 0, <8 x i1> undef, i32 undef) + call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 0) + call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 0, <16 x i1> undef, i32 undef) + call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 0) + call @llvm.vp.abs.nv2i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv2i8( undef, i1 0) + call @llvm.vp.abs.nv4i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv4i8( undef, i1 0) + call @llvm.vp.abs.nv8i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv8i8( undef, i1 0) + call @llvm.vp.abs.nv16i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv16i8( undef, i1 0) + call @llvm.vp.abs.nv2i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv2i64( undef, i1 0) + call @llvm.vp.abs.nv4i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv4i64( undef, i1 0) + call @llvm.vp.abs.nv8i64( undef, i1 0, undef, i32 undef) + call 
@llvm.abs.nv8i64( undef, i1 0) + call @llvm.vp.abs.nv16i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nv16i64( undef, i1 0) + ret void +} + +define void @load() { +; CHECK-LABEL: 'load' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call @llvm.vp.load.nxv2i8.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = load , ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t19 = call @llvm.vp.load.nxv4i8.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t20 = load , ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t21 = call @llvm.vp.load.nxv8i8.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t22 = load , ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t23 = call @llvm.vp.load.nxv16i8.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t24 = load , ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call @llvm.vp.load.nxv2i64.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = load 
, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call @llvm.vp.load.nxv4i64.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = load , ptr undef, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call @llvm.vp.load.nxv8i64.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = load , ptr undef, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call @llvm.vp.load.nxv16i64.p0(ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = load , ptr undef, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef) + %t1 = load <2 x i8>, ptr undef + %t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr undef, <4 x i1> undef, i32 undef) + %t3 = load <4 x i8>, ptr undef + %t4 = call <8 x i8> @llvm.vp.load.v8i8(ptr undef, <8 x i1> undef, i32 undef) + %t5 = load <8 x i8>, ptr undef + %t6 = call <16 x i8> @llvm.vp.load.v16i8(ptr undef, <16 x i1> undef, i32 undef) + %t7 = load <16 x i8>, ptr undef + %t8 = call <2 x i64> @llvm.vp.load.v2i64(ptr undef, <2 x i1> undef, i32 undef) + %t9 = load <2 x i64>, ptr undef + %t10 = call <4 x i64> @llvm.vp.load.v4i64(ptr undef, <4 x i1> undef, i32 undef) + %t12 = load <4 x i64>, ptr undef + %t13 = call <8 x i64> @llvm.vp.load.v8i64(ptr undef, <8 x i1> undef, i32 undef) + %t14 = load <8 x i64>, ptr undef + %t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef) + %t16 = load <16 x i64>, ptr undef + %t17 = call @llvm.vp.load.nv2i8(ptr undef, undef, i32 undef) + %t18 = load , ptr undef + %t19 = call @llvm.vp.load.nv4i8(ptr undef, undef, i32 undef) + %t20 = load , ptr undef + %t21 = call @llvm.vp.load.nv8i8(ptr undef, undef, i32 undef) + %t22 = load , ptr undef + %t23 = call @llvm.vp.load.nv16i8(ptr undef, undef, i32 undef) + %t24 = load , ptr undef + %t25 = call @llvm.vp.load.nv2i64(ptr undef, undef, i32 undef) + %t26 = load , ptr undef + %t27 = call @llvm.vp.load.nv4i64(ptr undef, undef, i32 undef) + %t28 = load , ptr undef + %t29 = call @llvm.vp.load.nv8i64(ptr undef, undef, i32 undef) + %t30 = load , ptr undef + %t31 = call @llvm.vp.load.nv16i64(ptr undef, undef, i32 undef) + %t32 = load , ptr undef + ret void +} + +define void @store() { +; CHECK-LABEL: 'store' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v4i8.p0(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v8i8.p0(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> undef, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v16i8.p0(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> undef, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i64.p0(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.v4i64.p0(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr undef, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.v8i64.p0(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> undef, ptr undef, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.v16i64.p0(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <16 x i64> undef, ptr undef, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv2i8.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store undef, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv4i8.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store undef, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.nxv8i8.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store undef, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv16i8.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store undef, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.vp.store.nxv2i64.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store undef, ptr undef, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.vp.store.nxv4i64.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store undef, ptr undef, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.vp.store.nxv8i64.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store undef, ptr undef, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.vp.store.nxv16i64.p0( undef, ptr undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store undef, ptr undef, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call void @llvm.vp.store.v2i8(<2 x i8> undef, ptr undef, <2 x i1> undef, i32 undef) + store <2 x i8> undef, ptr undef + call void @llvm.vp.store.v4i8(<4 x i8> undef, ptr undef, <4 x i1> undef, i32 undef) + store <4 x i8> undef, ptr 
undef + call void @llvm.vp.store.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef, i32 undef) + store <8 x i8> undef, ptr undef + call void @llvm.vp.store.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef, i32 undef) + store <16 x i8> undef, ptr undef + call void @llvm.vp.store.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef, i32 undef) + store <2 x i64> undef, ptr undef + call void @llvm.vp.store.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef, i32 undef) + store <4 x i64> undef, ptr undef + call void @llvm.vp.store.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef, i32 undef) + store <8 x i64> undef, ptr undef + call void @llvm.vp.store.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef) + store <16 x i64> undef, ptr undef + call void @llvm.vp.store.nv2i8( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv4i8( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv8i8( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv16i8( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv2i64( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv4i64( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv8i64( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + call void @llvm.vp.store.nv16i64( undef, ptr undef, undef, i32 undef) + store undef, ptr undef + ret void +} + +declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) +declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) +declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) +declare <16 x i8> @llvm.vp.add.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) +declare <2 x i64> @llvm.vp.add.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) +declare <4 x i64> @llvm.vp.add.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) +declare <8 x i64> @llvm.vp.add.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) +declare <16 x i64> @llvm.vp.add.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) +declare @llvm.vp.add.nv2i8(, , , i32) +declare @llvm.vp.add.nv4i8(, , , i32) +declare @llvm.vp.add.nv8i8(, , , i32) +declare @llvm.vp.add.nv16i8(, , , i32) +declare @llvm.vp.add.nv2i64(, , , i32) +declare @llvm.vp.add.nv4i64(, , , i32) +declare @llvm.vp.add.nv8i64(, , , i32) +declare @llvm.vp.add.nv16i64(, , , i32) + +declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1, <2 x i1>, i32) +declare <4 x i8> @llvm.vp.abs.v4i8(<4 x i8>, i1, <4 x i1>, i32) +declare <8 x i8> @llvm.vp.abs.v8i8(<8 x i8>, i1, <8 x i1>, i32) +declare <16 x i8> @llvm.vp.abs.v16i8(<16 x i8>, i1, <16 x i1>, i32) +declare <2 x i64> @llvm.vp.abs.v2i64(<2 x i64>, i1, <2 x i1>, i32) +declare <4 x i64> @llvm.vp.abs.v4i64(<4 x i64>, i1, <4 x i1>, i32) +declare <8 x i64> @llvm.vp.abs.v8i64(<8 x i64>, i1, <8 x i1>, i32) +declare <16 x i64> @llvm.vp.abs.v16i64(<16 x i64>, i1, <16 x i1>, i32) +declare @llvm.vp.abs.nv2i8(, i1, , i32) +declare @llvm.vp.abs.nv4i8(, i1, , i32) +declare @llvm.vp.abs.nv8i8(, i1, , i32) +declare @llvm.vp.abs.nv16i8(, i1, , i32) +declare @llvm.vp.abs.nv2i64(, i1, , i32) +declare @llvm.vp.abs.nv4i64(, i1, , i32) +declare @llvm.vp.abs.nv8i64(, i1, , i32) +declare @llvm.vp.abs.nv16i64(, i1, , i32) + +declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1) +declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1) +declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, 
i1)
+declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
+declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
+declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
+declare <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8>, i1)
+declare <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8>, i1)
+declare <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8>, i1)
+declare <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8>, i1)
+declare <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64>, i1)
+declare <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64>, i1)
+declare <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64>, i1)
+declare <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64>, i1)
+
+declare <2 x i8> @llvm.vp.load.v2i8(ptr, <2 x i1>, i32)
+declare <4 x i8> @llvm.vp.load.v4i8(ptr, <4 x i1>, i32)
+declare <8 x i8> @llvm.vp.load.v8i8(ptr, <8 x i1>, i32)
+declare <16 x i8> @llvm.vp.load.v16i8(ptr, <16 x i1>, i32)
+declare <2 x i64> @llvm.vp.load.v2i64(ptr, <2 x i1>, i32)
+declare <4 x i64> @llvm.vp.load.v4i64(ptr, <4 x i1>, i32)
+declare <8 x i64> @llvm.vp.load.v8i64(ptr, <8 x i1>, i32)
+declare <16 x i64> @llvm.vp.load.v16i64(ptr, <16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr, <vscale x 16 x i1>, i32)
+
+declare void @llvm.vp.store.v2i8(<2 x i8>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i8(<4 x i8>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i8(<8 x i8>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i8(<16 x i8>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.v2i64(<2 x i64>, ptr, <2 x i1>, i32)
+declare void @llvm.vp.store.v4i64(<4 x i64>, ptr, <4 x i1>, i32)
+declare void @llvm.vp.store.v8i64(<8 x i64>, ptr, <8 x i1>, i32)
+declare void @llvm.vp.store.v16i64(<16 x i64>, ptr, <16 x i1>, i32)
+declare void @llvm.vp.store.nv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+
 declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
 declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
-
 declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float>, i32)
 declare <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float>)

From 18fa7ac3359093b1737541532a5db0b67e9fa708 Mon Sep 17 00:00:00 2001
From: Michael Maitland
Date: Tue, 10 Oct 2023 12:49:07 -0700
Subject: [PATCH 2/4] [CostModel] Cost VPReductions as non-vp counterpart

There was a bug that occurred because VPReductions have a start value
argument that their non-vp counterparts do not have. Once this bug was
fixed, I discovered that non-vp reductions get costed using
Intrinsic::not_intrinsic. This should be fixed by a future patch, but
for now the VP version of the intrinsic is costed in the same way.
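For illustration, the argument mismatch looks like this (signatures as
in LangRef, shown here for context and not part of this diff):

  declare i8 @llvm.vp.reduce.add.nxv2i8(i8 %start, <vscale x 2 x i8> %v, <vscale x 2 x i1> %m, i32 %evl)
  declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %v)
  declare float @llvm.vector.reduce.fadd.nxv2f32(float %start, <vscale x 2 x float> %v)

When building the non-vp cost attributes, the start value type is
dropped from the type list, except for fadd/fmul, whose non-vp
counterparts keep the start operand.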
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  27 +-
 .../CostModel/RISCV/rvv-intrinsics.ll         | 370 ++++++++++++++----
 2 files changed, 311 insertions(+), 86 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 2d0d10982aff6..f87bfa1f5ee28 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1695,11 +1695,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
   // counterpart when the vector length argument is smaller than the maximum
   // vector length.
+  // TODO: Support other kinds of VPIntrinsics
   if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
     std::optional<unsigned> FOp =
         VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
     if (FOp) {
-      // TODO: Support other kinds of Intrinsics (i.e. reductions)
       if (ICA.getID() == Intrinsic::vp_load) {
         Align Alignment;
         if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
@@ -1737,17 +1737,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
              "Expected VPIntrinsic to have Mask and Vector Length args and "
             "types");
-      ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs()).drop_back(2);
       ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
 
-      IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
-                                     NewTys, ICA.getFlags(), ICA.getInst(),
-                                     ICA.getScalarizationCost());
+      // VPReduction intrinsics have a start value argument that their non-vp
+      // counterparts do not have, except for the fadd and fmul non-vp
+      // counterparts. Drop it while FID still names the functional intrinsic,
+      // before FID is rewritten below.
+      if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
+          *FID != Intrinsic::vector_reduce_fadd &&
+          *FID != Intrinsic::vector_reduce_fmul)
+        NewTys = NewTys.drop_front();
+
+      // FIXME: it looks like non-vp reductions are costed using the
+      // Intrinsic::not_intrinsic opcode in the cost model. In the future,
+      // they should use the correct intrinsic opcode. The approach for
+      // costing VPIntrinsics is to cost them as their non-vp counterpart, so
+      // we use Intrinsic::not_intrinsic below; however, this must change when
+      // non-vp reductions use the correct ID.
+      if (VPReductionIntrinsic::isVPReduction(ICA.getID()))
+        FID = std::make_optional(Intrinsic::not_intrinsic);
+
+      IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
+                                     ICA.getFlags());
       return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
     }
   }
 
   // Assume that we need to scalarize this intrinsic.
   // Compute the scalarization overhead based on Args for a vector
   // intrinsic.
InstructionCost ScalarizationCost = InstructionCost::getInvalid(); diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll index 85364c935267d..5707db18cfe92 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll @@ -258,21 +258,21 @@ define void @add() { %t14 = add <8 x i64> undef, undef %t15 = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> undef, <16 x i64> undef, <16 x i1> undef, i32 undef) %t16 = add <16 x i64> undef, undef - %t17 = call @llvm.vp.add.nv2i8( undef, undef, undef, i32 undef) + %t17 = call @llvm.vp.add.nxv2i8( undef, undef, undef, i32 undef) %t18 = add undef, undef - %t19 = call @llvm.vp.add.nv4i8( undef, undef, undef, i32 undef) + %t19 = call @llvm.vp.add.nxv4i8( undef, undef, undef, i32 undef) %t20 = add undef, undef - %t21 = call @llvm.vp.add.nv8i8( undef, undef, undef, i32 undef) + %t21 = call @llvm.vp.add.nxv8i8( undef, undef, undef, i32 undef) %t22 = add undef, undef - %t23 = call @llvm.vp.add.nv16i8( undef, undef, undef, i32 undef) + %t23 = call @llvm.vp.add.nxv16i8( undef, undef, undef, i32 undef) %t24 = add undef, undef - %t25 = call @llvm.vp.add.nv2i64( undef, undef, undef, i32 undef) + %t25 = call @llvm.vp.add.nxv2i64( undef, undef, undef, i32 undef) %t26 = add undef, undef - %t27 = call @llvm.vp.add.nv4i64( undef, undef, undef, i32 undef) + %t27 = call @llvm.vp.add.nxv4i64( undef, undef, undef, i32 undef) %t28 = add undef, undef - %t29 = call @llvm.vp.add.nv8i64( undef, undef, undef, i32 undef) + %t29 = call @llvm.vp.add.nxv8i64( undef, undef, undef, i32 undef) %t30 = add undef, undef - %t31 = call @llvm.vp.add.nv16i64( undef, undef, undef, i32 undef) + %t31 = call @llvm.vp.add.nxv16i64( undef, undef, undef, i32 undef) %t32 = add undef, undef ret void } @@ -329,22 +329,22 @@ define void @abs() { call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 0) call <16 x i64> @llvm.vp.abs.v16i64(<16 x i64> undef, i1 0, <16 x i1> undef, i32 undef) call <16 x i64> @llvm.abs.v16i64(<16 x i64> undef, i1 0) - call @llvm.vp.abs.nv2i8( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv2i8( undef, i1 0) - call @llvm.vp.abs.nv4i8( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv4i8( undef, i1 0) - call @llvm.vp.abs.nv8i8( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv8i8( undef, i1 0) - call @llvm.vp.abs.nv16i8( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv16i8( undef, i1 0) - call @llvm.vp.abs.nv2i64( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv2i64( undef, i1 0) - call @llvm.vp.abs.nv4i64( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv4i64( undef, i1 0) - call @llvm.vp.abs.nv8i64( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv8i64( undef, i1 0) - call @llvm.vp.abs.nv16i64( undef, i1 0, undef, i32 undef) - call @llvm.abs.nv16i64( undef, i1 0) + call @llvm.vp.abs.nxv2i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv2i8( undef, i1 0) + call @llvm.vp.abs.nxv4i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv4i8( undef, i1 0) + call @llvm.vp.abs.nxv8i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv8i8( undef, i1 0) + call @llvm.vp.abs.nxv16i8( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv16i8( undef, i1 0) + call @llvm.vp.abs.nxv2i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv2i64( undef, i1 0) + call @llvm.vp.abs.nxv4i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv4i64( undef, i1 0) + call @llvm.vp.abs.nxv8i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv8i64( undef, i1 0) + call 
@llvm.vp.abs.nxv16i64( undef, i1 0, undef, i32 undef) + call @llvm.abs.nxv16i64( undef, i1 0) ret void } @@ -400,21 +400,21 @@ define void @load() { %t14 = load <8 x i64>, ptr undef %t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef) %t16 = load <16 x i64>, ptr undef - %t17 = call @llvm.vp.load.nv2i8(ptr undef, undef, i32 undef) + %t17 = call @llvm.vp.load.nxv2i8(ptr undef, undef, i32 undef) %t18 = load , ptr undef - %t19 = call @llvm.vp.load.nv4i8(ptr undef, undef, i32 undef) + %t19 = call @llvm.vp.load.nxv4i8(ptr undef, undef, i32 undef) %t20 = load , ptr undef - %t21 = call @llvm.vp.load.nv8i8(ptr undef, undef, i32 undef) + %t21 = call @llvm.vp.load.nxv8i8(ptr undef, undef, i32 undef) %t22 = load , ptr undef - %t23 = call @llvm.vp.load.nv16i8(ptr undef, undef, i32 undef) + %t23 = call @llvm.vp.load.nxv16i8(ptr undef, undef, i32 undef) %t24 = load , ptr undef - %t25 = call @llvm.vp.load.nv2i64(ptr undef, undef, i32 undef) + %t25 = call @llvm.vp.load.nxv2i64(ptr undef, undef, i32 undef) %t26 = load , ptr undef - %t27 = call @llvm.vp.load.nv4i64(ptr undef, undef, i32 undef) + %t27 = call @llvm.vp.load.nxv4i64(ptr undef, undef, i32 undef) %t28 = load , ptr undef - %t29 = call @llvm.vp.load.nv8i64(ptr undef, undef, i32 undef) + %t29 = call @llvm.vp.load.nxv8i64(ptr undef, undef, i32 undef) %t30 = load , ptr undef - %t31 = call @llvm.vp.load.nv16i64(ptr undef, undef, i32 undef) + %t31 = call @llvm.vp.load.nxv16i64(ptr undef, undef, i32 undef) %t32 = load , ptr undef ret void } @@ -471,25 +471,167 @@ define void @store() { store <8 x i64> undef, ptr undef call void @llvm.vp.store.v16i64(<16 x i64> undef, ptr undef, <16 x i1> undef, i32 undef) store <16 x i64> undef, ptr undef - call void @llvm.vp.store.nv2i8( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv2i8( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv4i8( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv4i8( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv8i8( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv8i8( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv16i8( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv16i8( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv2i64( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv2i64( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv4i64( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv4i64( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv8i64( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv8i64( undef, ptr undef, undef, i32 undef) store undef, ptr undef - call void @llvm.vp.store.nv16i64( undef, ptr undef, undef, i32 undef) + call void @llvm.vp.store.nxv16i64( undef, ptr undef, undef, i32 undef) store undef, ptr undef ret void } +define void @reduce_add() { +; CHECK-LABEL: 'reduce_add' +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %1 = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call i8 @llvm.reduce.add.v2i8(<2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %3 = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 
x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call i8 @llvm.reduce.add.v4i8(<4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %5 = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call i8 @llvm.reduce.add.v8i8(<8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %7 = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call i8 @llvm.reduce.add.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %9 = call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call i64 @llvm.reduce.add.v2i64(<2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %11 = call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call i64 @llvm.reduce.add.v4i64(<4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call i64 @llvm.reduce.add.v8i64(<8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %15 = call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call i64 @llvm.reduce.add.v16i64(<16 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call i8 @llvm.reduce.add.nxv2i8( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call i8 @llvm.reduce.add.nxv4i8( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %21 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call i8 @llvm.reduce.add.nxv8i8( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %23 = call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call i8 @llvm.reduce.add.nxv16i8( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %25 = call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call i64 @llvm.reduce.add.nxv2i64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call i64 @llvm.reduce.add.nxv4i64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: 
%30 = call i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.v2i8(<2 x i8> undef)
+  call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.v4i8(<4 x i8> undef)
+  call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.v8i8(<8 x i8> undef)
+  call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.v16i8(<16 x i8> undef)
+  call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.v2i64(<2 x i64> undef)
+  call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.v4i64(<4 x i64> undef)
+  call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.v8i64(<8 x i64> undef)
+  call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.v16i64(<16 x i64> undef)
+  call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+  call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+  call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+  call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+  call i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+  call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+  call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+  call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+  call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+  call i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+  ret void
+}
+
+define void @reduce_fadd() {
+; CHECK-LABEL: 'reduce_fadd'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %30 = call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
+  call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
+  call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
+  call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
+  call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
+  call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
+  call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
+  call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
+  call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+  call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+  call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+  call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+  call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+  call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+  call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+  call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+  call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+  call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+  ret void
+}
+
 declare <2 x i8> @llvm.vp.add.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
 declare <4 x i8> @llvm.vp.add.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
 declare <8 x i8> @llvm.vp.add.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
@@ -498,14 +640,14 @@ declare <2 x i64> @llvm.vp.add.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
 declare <4 x i64> @llvm.vp.add.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
 declare <8 x i64> @llvm.vp.add.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
 declare <16 x i64> @llvm.vp.add.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.vp.add.nv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i8> @llvm.vp.add.nv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i8> @llvm.vp.add.nv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i8> @llvm.vp.add.nv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i64> @llvm.vp.add.nv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i64> @llvm.vp.add.nv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i64> @llvm.vp.add.nv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i64> @llvm.vp.add.nv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.add.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.add.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.add.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.add.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.add.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.add.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
 
 declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1, <2 x i1>, i32)
 declare <4 x i8> @llvm.vp.abs.v4i8(<4 x i8>, i1, <4 x i1>, i32)
@@ -515,14 +657,14 @@ declare <2 x i64> @llvm.vp.abs.v2i64(<2 x i64>, i1, <2 x i1>, i32)
 declare <4 x i64> @llvm.vp.abs.v4i64(<4 x i64>, i1, <4 x i1>, i32)
 declare <8 x i64> @llvm.vp.abs.v8i64(<8 x i64>, i1, <8 x i1>, i32)
 declare <16 x i64> @llvm.vp.abs.v16i64(<16 x i64>, i1, <16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.vp.abs.nv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i8> @llvm.vp.abs.nv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i8> @llvm.vp.abs.nv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i8> @llvm.vp.abs.nv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i64> @llvm.vp.abs.nv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i64> @llvm.vp.abs.nv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i64> @llvm.vp.abs.nv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i64> @llvm.vp.abs.nv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.abs.nxv2i8(<vscale x 2 x i8>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.abs.nxv4i8(<vscale x 4 x i8>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.abs.nxv8i8(<vscale x 8 x i8>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.abs.nxv16i8(<vscale x 16 x i8>, i1, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.abs.nxv2i64(<vscale x 2 x i64>, i1, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.abs.nxv4i64(<vscale x 4 x i64>, i1, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.abs.nxv8i64(<vscale x 8 x i64>, i1, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64>, i1, <vscale x 16 x i1>, i32)
 
 declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
 declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
@@ -532,14 +674,14 @@ declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
 declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
 declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
 declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
-declare <vscale x 2 x i8> @llvm.abs.nv2i8(<vscale x 2 x i8>, i1)
-declare <vscale x 4 x i8> @llvm.abs.nv4i8(<vscale x 4 x i8>, i1)
-declare <vscale x 8 x i8> @llvm.abs.nv8i8(<vscale x 8 x i8>, i1)
-declare <vscale x 16 x i8> @llvm.abs.nv16i8(<vscale x 16 x i8>, i1)
-declare <vscale x 2 x i64> @llvm.abs.nv2i64(<vscale x 2 x i64>, i1)
-declare <vscale x 4 x i64> @llvm.abs.nv4i64(<vscale x 4 x i64>, i1)
-declare <vscale x 8 x i64> @llvm.abs.nv8i64(<vscale x 8 x i64>, i1)
-declare <vscale x 16 x i64> @llvm.abs.nv16i64(<vscale x 16 x i64>, i1)
+declare <vscale x 2 x i8> @llvm.abs.nxv2i8(<vscale x 2 x i8>, i1)
+declare <vscale x 4 x i8> @llvm.abs.nxv4i8(<vscale x 4 x i8>, i1)
+declare <vscale x 8 x i8> @llvm.abs.nxv8i8(<vscale x 8 x i8>, i1)
+declare <vscale x 16 x i8> @llvm.abs.nxv16i8(<vscale x 16 x i8>, i1)
+declare <vscale x 2 x i64> @llvm.abs.nxv2i64(<vscale x 2 x i64>, i1)
+declare <vscale x 4 x i64> @llvm.abs.nxv4i64(<vscale x 4 x i64>, i1)
+declare <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64>, i1)
+declare <vscale x 16 x i64> @llvm.abs.nxv16i64(<vscale x 16 x i64>, i1)
 
 declare <2 x i8> @llvm.vp.load.v2i8(ptr, <2 x i1>, i32)
 declare <4 x i8> @llvm.vp.load.v4i8(ptr, <4 x i1>, i32)
@@ -549,14 +691,14 @@ declare <2 x i64> @llvm.vp.load.v2i64(ptr, <2 x i1>, i32)
 declare <4 x i64> @llvm.vp.load.v4i64(ptr, <4 x i1>, i32)
 declare <8 x i64> @llvm.vp.load.v8i64(ptr, <8 x i1>, i32)
 declare <16 x i64> @llvm.vp.load.v16i64(ptr, <16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.vp.load.nv2i8(ptr, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i8> @llvm.vp.load.nv4i8(ptr, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i8> @llvm.vp.load.nv8i8(ptr, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i8> @llvm.vp.load.nv16i8(ptr, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i64> @llvm.vp.load.nv2i64(ptr, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i64> @llvm.vp.load.nv4i64(ptr, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i64> @llvm.vp.load.nv8i64(ptr, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i64> @llvm.vp.load.nv16i64(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.load.nxv2i8(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i8> @llvm.vp.load.nxv4i8(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.load.nxv8i8(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i8> @llvm.vp.load.nxv16i8(ptr, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i64> @llvm.vp.load.nxv2i64(ptr, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i64> @llvm.vp.load.nxv4i64(ptr, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i64> @llvm.vp.load.nxv8i64(ptr, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i64> @llvm.vp.load.nxv16i64(ptr, <vscale x 16 x i1>, i32)
 
 declare void @llvm.vp.store.v2i8(<2 x i8>, ptr, <2 x i1>, i32)
 declare void @llvm.vp.store.v4i8(<4 x i8>, ptr, <4 x i1>, i32)
@@ -566,14 +708,82 @@ declare void @llvm.vp.store.v2i64(<2 x i64>, ptr, <2 x i1>, i32)
 declare void @llvm.vp.store.v4i64(<4 x i64>, ptr, <4 x i1>, i32)
 declare void @llvm.vp.store.v8i64(<8 x i64>, ptr, <8 x i1>, i32)
 declare void @llvm.vp.store.v16i64(<16 x i64>, ptr, <16 x i1>, i32)
-declare void @llvm.vp.store.nv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
-declare void @llvm.vp.store.nv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
-declare void @llvm.vp.store.nv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
-declare void @llvm.vp.store.nv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
-declare void @llvm.vp.store.nv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
-declare void @llvm.vp.store.nv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
-declare void @llvm.vp.store.nv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
-declare void @llvm.vp.store.nv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nxv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nxv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nxv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nxv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>, i32)
+declare void @llvm.vp.store.nxv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nxv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
+declare void @llvm.vp.store.nxv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
+declare void @llvm.vp.store.nxv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
+
+declare i8 @llvm.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.reduce.add.v16i8(<16 x i8>)
+declare i64 @llvm.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.reduce.add.v16i64(<16 x i64>)
+declare i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8>)
+declare i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8>)
+declare i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8>)
+declare i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64>)
+declare i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64>)
+
+declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.v8i8(i8, <8 x i8>, <8 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.v16i8(i8, <16 x i8>, <16 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v2i64(i64, <2 x i64>, <2 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v4i64(i64, <4 x i64>, <4 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v8i64(i64, <8 x i64>, <8 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.v16i64(i64, <16 x i64>, <16 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv2i8(i8, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv4i8(i8, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv8i8(i8, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare i8 @llvm.vp.reduce.add.nxv16i8(i8, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv2i64(i64, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv8i64(i64, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+declare i64 @llvm.vp.reduce.add.nxv16i64(i64, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
+
+declare float @llvm.reduce.fadd.v2f32(float, <2 x float>)
+declare float @llvm.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.reduce.fadd.v8f32(float, <8 x float>)
+declare float @llvm.reduce.fadd.v16f32(float, <16 x float>)
+declare double @llvm.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.reduce.fadd.v4f64(double, <4 x double>)
+declare double @llvm.reduce.fadd.v8f64(double, <8 x double>)
+declare double @llvm.reduce.fadd.v16f64(double, <16 x double>)
+declare float @llvm.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+declare float @llvm.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
+declare float @llvm.reduce.fadd.nxv16f32(float, <vscale x 16 x float>)
+declare double @llvm.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
+declare double @llvm.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
+declare double @llvm.reduce.fadd.nxv8f64(double, <vscale x 8 x double>)
+declare double @llvm.reduce.fadd.nxv16f64(double, <vscale x 16 x double>)
+
+declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v8f32(float, <8 x float>, <8 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.v16f32(float, <16 x float>, <16 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v2f64(double, <2 x double>, <2 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v4f64(double, <4 x double>, <4 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v8f64(double, <8 x double>, <8 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.v16f64(double, <16 x double>, <16 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv2f32(float, <vscale x 2 x float>, <vscale x 2 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv4f32(float, <vscale x 4 x float>, <vscale x 4 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv8f32(float, <vscale x 8 x float>, <vscale x 8 x i1>, i32)
+declare float @llvm.vp.reduce.fadd.nxv16f32(float, <vscale x 16 x float>, <vscale x 16 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv2f64(double, <vscale x 2 x double>, <vscale x 2 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv4f64(double, <vscale x 4 x double>, <vscale x 4 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv8f64(double, <vscale x 8 x double>, <vscale x 8 x i1>, i32)
+declare double @llvm.vp.reduce.fadd.nxv16f64(double, <vscale x 16 x double>, <vscale x 16 x i1>, i32)
 
 declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
 declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)

From 8ae8afcd24360137f66d8c2f57b463cf60636479 Mon Sep 17 00:00:00 2001
From: Michael Maitland
Date: Thu, 19 Oct 2023 09:53:41 -0700
Subject: [PATCH 3/4] fix intrinsic name

---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   9 -
 .../CostModel/RISCV/rvv-intrinsics.ll         | 160 +++++++++---------
 2 files changed, 80 insertions(+), 89 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f87bfa1f5ee28..a90ea0ba32ffb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1739,15 +1739,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
              "types");
       ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);
 
-      // FIXME: it looks like non-vp reductions are costed using the
-      // Intrinsic::not_intrinsic opcode in the cost model. In the future,
-      // they should use the correct intrinsic opcode. The approach for
-      // costing VPIntrinsics is to cost them as their non-vp counterpart so
-      // we use Intrinsic::not_intrinsic below, however this must change when
-      // non-vp reductions use the correct ID.
-      if (VPReductionIntrinsic::isVPReduction(ICA.getID()))
-        FID = std::make_optional(Intrinsic::not_intrinsic);
-
       // VPReduction intrinsics have a start value argument that their non-vp
       // counterparts do not have, except for the fadd and fmul non-vp
       // counterpart.
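[Note on the hunk above: the retained comment is the crux of this commit, so a minimal standalone C++ sketch of the operand mapping it describes may help. This is illustrative only and not part of the patch; the helper name getNonVPOperands is invented here, but the calls it makes (VPReductionIntrinsic::isVPReduction, ArrayRef::drop_back/drop_front, the Intrinsic::vector_reduce_* IDs) are existing LLVM APIs.]

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/IntrinsicInst.h"
    #include "llvm/IR/Intrinsics.h"

    using namespace llvm;

    // Sketch: trim a VP intrinsic's operand list down to the operand list
    // expected by its functional (non-vp) counterpart when building the
    // IntrinsicCostAttributes for the recursive cost query.
    static ArrayRef<const Value *>
    getNonVPOperands(Intrinsic::ID VPID, Intrinsic::ID FunctionalID,
                     ArrayRef<const Value *> Args) {
      // Every VP intrinsic carries a trailing mask and vector-length (EVL)
      // operand; the functional counterpart has neither.
      Args = Args.drop_back(2);
      // VP reductions additionally carry a leading start value. Only the
      // fadd/fmul counterparts also take a start value, so drop it for
      // every other reduction.
      if (VPReductionIntrinsic::isVPReduction(VPID) &&
          FunctionalID != Intrinsic::vector_reduce_fadd &&
          FunctionalID != Intrinsic::vector_reduce_fmul)
        Args = Args.drop_front(1);
      return Args;
    }

[Under this mapping, llvm.vp.reduce.fadd keeps its start value because llvm.vector.reduce.fadd takes one too, which is why the fadd tests below pair a four-operand vp call with a two-operand non-vp call, while the add tests pair it with a one-operand call.]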
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 5707db18cfe92..07ac6e15fdcbb 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -492,108 +492,108 @@ define void @store() {
 
 define void @reduce_add() {
 ; CHECK-LABEL: 'reduce_add'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %1 = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call i8 @llvm.reduce.add.v2i8(<2 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %3 = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call i8 @llvm.reduce.add.v4i8(<4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %5 = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call i8 @llvm.reduce.add.v8i8(<8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %7 = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call i8 @llvm.reduce.add.v16i8(<16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %9 = call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call i64 @llvm.reduce.add.v2i64(<2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %11 = call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call i64 @llvm.reduce.add.v4i64(<4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call i64 @llvm.reduce.add.v8i64(<8 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %15 = call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call i64 @llvm.reduce.add.v16i64(<16 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %21 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %23 = call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %25 = call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %30 = call i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %5 = call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %6 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %7 = call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %13 = call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %14 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %15 = call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %19 = call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %20 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %21 = call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %22 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %23 = call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %24 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %27 = call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %28 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %29 = call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %30 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %31 = call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call i8 @llvm.vp.reduce.add.v2i8(i8 undef, <2 x i8> undef, <2 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.v2i8(<2 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
   call i8 @llvm.vp.reduce.add.v4i8(i8 undef, <4 x i8> undef, <4 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.v4i8(<4 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
   call i8 @llvm.vp.reduce.add.v8i8(i8 undef, <8 x i8> undef, <8 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.v8i8(<8 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
   call i8 @llvm.vp.reduce.add.v16i8(i8 undef, <16 x i8> undef, <16 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.v16i8(<16 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
   call i64 @llvm.vp.reduce.add.v2i64(i64 undef, <2 x i64> undef, <2 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.v2i64(<2 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
   call i64 @llvm.vp.reduce.add.v4i64(i64 undef, <4 x i64> undef, <4 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.v4i64(<4 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
   call i64 @llvm.vp.reduce.add.v8i64(i64 undef, <8 x i64> undef, <8 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.v8i64(<8 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
   call i64 @llvm.vp.reduce.add.v16i64(i64 undef, <16 x i64> undef, <16 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.v16i64(<16 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
   call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
   call i8 @llvm.vp.reduce.add.nxv4i8(i8 undef, <vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
   call i8 @llvm.vp.reduce.add.nxv8i8(i8 undef, <vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
   call i8 @llvm.vp.reduce.add.nxv16i8(i8 undef, <vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-  call i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
   call i64 @llvm.vp.reduce.add.nxv2i64(i64 undef, <vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
   call i64 @llvm.vp.reduce.add.nxv4i64(i64 undef, <vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
   call i64 @llvm.vp.reduce.add.nxv8i64(i64 undef, <vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
   call i64 @llvm.vp.reduce.add.nxv16i64(i64 undef, <vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-  call i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> undef)
   ret void
 }
 
 define void @reduce_fadd() {
 ; CHECK-LABEL: 'reduce_fadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %30 = call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -717,22 +717,22 @@ declare void @llvm.vp.store.nxv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>, i32)
 declare void @llvm.vp.store.nxv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>, i32)
 declare void @llvm.vp.store.nxv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>, i32)
 
-declare i8 @llvm.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.reduce.add.v16i8(<16 x i8>)
-declare i64 @llvm.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.reduce.add.v16i64(<16 x i64>)
-declare i8 @llvm.reduce.add.nxv2i8(<vscale x 2 x i8>)
-declare i8 @llvm.reduce.add.nxv4i8(<vscale x 4 x i8>)
-declare i8 @llvm.reduce.add.nxv8i8(<vscale x 8 x i8>)
-declare i8 @llvm.reduce.add.nxv16i8(<vscale x 16 x i8>)
-declare i64 @llvm.reduce.add.nxv2i64(<vscale x 2 x i64>)
-declare i64 @llvm.reduce.add.nxv4i64(<vscale x 4 x i64>)
-declare i64 @llvm.reduce.add.nxv8i64(<vscale x 8 x i64>)
-declare i64 @llvm.reduce.add.nxv16i64(<vscale x 16 x i64>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8>)
+declare i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8>)
+declare i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8>)
+declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
+declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>)
+declare i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64>)
+declare i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64>)
 
 declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32)
 declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32)

From 2c8cdc67c9cfd668d4fda32f805cf4a1f7646cf8 Mon Sep 17 00:00:00 2001
From: Michael Maitland
Date: Thu, 19 Oct 2023 17:31:16 -0700
Subject: [PATCH 4/4] vector.reduce.fadd

---
 .../CostModel/RISCV/rvv-intrinsics.ll         | 96 +++++++++----------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
index 07ac6e15fdcbb..43a03404e8db6 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll
@@ -564,71 +564,71 @@ define void @reduce_add() {
 
 define void @reduce_fadd() {
 ; CHECK-LABEL: 'reduce_fadd'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %2 = call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call float @llvm.vector.reduce.fadd.v2f32(float undef, <2 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %3 = call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %4 = call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %4 = call float @llvm.vector.reduce.fadd.v4f32(float undef, <4 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %5 = call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %6 = call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %6 = call float @llvm.vector.reduce.fadd.v8f32(float undef, <8 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %7 = call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %8 = call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %8 = call float @llvm.vector.reduce.fadd.v16f32(float undef, <16 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %10 = call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call double @llvm.vector.reduce.fadd.v2f64(double undef, <2 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %12 = call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %12 = call double @llvm.vector.reduce.fadd.v4f64(double undef, <4 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %13 = call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %14 = call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %14 = call double @llvm.vector.reduce.fadd.v8f64(double undef, <8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %15 = call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %16 = call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %16 = call double @llvm.vector.reduce.fadd.v16f64(double undef, <16 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %18 = call float @llvm.vector.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %19 = call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %20 = call float @llvm.vector.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %21 = call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %22 = call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %22 = call float @llvm.vector.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %23 = call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %24 = call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %25 = call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %26 = call double @llvm.vector.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %27 = call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %28 = call double @llvm.vector.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %29 = call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %30 = call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %30 = call double @llvm.vector.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %31 = call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %32 = call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %32 = call double @llvm.vector.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call float @llvm.vp.reduce.fadd.v2f32(float undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.v2f32(float undef, <2 x float> undef)
+  call float @llvm.vector.reduce.fadd.v2f32(float undef, <2 x float> undef)
   call float @llvm.vp.reduce.fadd.v4f32(float undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.v4f32(float undef, <4 x float> undef)
+  call float @llvm.vector.reduce.fadd.v4f32(float undef, <4 x float> undef)
   call float @llvm.vp.reduce.fadd.v8f32(float undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.v8f32(float undef, <8 x float> undef)
+  call float @llvm.vector.reduce.fadd.v8f32(float undef, <8 x float> undef)
   call float @llvm.vp.reduce.fadd.v16f32(float undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.v16f32(float undef, <16 x float> undef)
+  call float @llvm.vector.reduce.fadd.v16f32(float undef, <16 x float> undef)
   call double @llvm.vp.reduce.fadd.v2f64(double undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.v2f64(double undef, <2 x double> undef)
+  call double @llvm.vector.reduce.fadd.v2f64(double undef, <2 x double> undef)
   call double @llvm.vp.reduce.fadd.v4f64(double undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.v4f64(double undef, <4 x double> undef)
+  call double @llvm.vector.reduce.fadd.v4f64(double undef, <4 x double> undef)
   call double @llvm.vp.reduce.fadd.v8f64(double undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.v8f64(double undef, <8 x double> undef)
+  call double @llvm.vector.reduce.fadd.v8f64(double undef, <8 x double> undef)
   call double @llvm.vp.reduce.fadd.v16f64(double undef, <16 x double> undef, <16 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.v16f64(double undef, <16 x double> undef)
+  call double @llvm.vector.reduce.fadd.v16f64(double undef, <16 x double> undef)
   call float @llvm.vp.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv2f32(float undef, <vscale x 2 x float> undef)
   call float @llvm.vp.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv4f32(float undef, <vscale x 4 x float> undef)
   call float @llvm.vp.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv8f32(float undef, <vscale x 8 x float> undef)
   call float @llvm.vp.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-  call float @llvm.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv16f32(float undef, <vscale x 16 x float> undef)
   call double @llvm.vp.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
+  call double @llvm.vector.reduce.fadd.nxv2f64(double undef, <vscale x 2 x double> undef)
   call double @llvm.vp.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
+  call double @llvm.vector.reduce.fadd.nxv4f64(double undef, <vscale x 4 x double> undef)
   call double @llvm.vp.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
+  call double @llvm.vector.reduce.fadd.nxv8f64(double undef, <vscale x 8 x double> undef)
   call double @llvm.vp.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-  call double @llvm.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
+  call double @llvm.vector.reduce.fadd.nxv16f64(double undef, <vscale x 16 x double> undef)
   ret void
 }
 
@@ -751,22 +751,22 @@ declare i64 @llvm.vp.reduce.add.nxv4i64(i64, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
 declare i64 @llvm.vp.reduce.add.nxv8i64(i64, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
 declare i64 @llvm.vp.reduce.add.nxv16i64(i64, <vscale x 16 x i64>, <vscale x 16 x i1>, i32)
 
-declare float @llvm.reduce.fadd.v2f32(float, <2 x float>)
-declare float @llvm.reduce.fadd.v4f32(float, <4 x float>)
-declare float @llvm.reduce.fadd.v8f32(float, <8 x float>)
-declare float @llvm.reduce.fadd.v16f32(float, <16 x float>)
-declare double @llvm.reduce.fadd.v2f64(double, <2 x double>)
-declare double @llvm.reduce.fadd.v4f64(double, <4 x double>)
-declare double @llvm.reduce.fadd.v8f64(double, <8 x double>)
-declare double @llvm.reduce.fadd.v16f64(double, <16 x double>)
-declare float @llvm.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
-declare float @llvm.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
-declare float @llvm.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
-declare float @llvm.reduce.fadd.nxv16f32(float, <vscale x 16 x float>)
-declare double @llvm.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
-declare double @llvm.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
-declare double @llvm.reduce.fadd.nxv8f64(double, <vscale x 8 x double>)
-declare double @llvm.reduce.fadd.nxv16f64(double, <vscale x 16 x double>)
+declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
+declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
+declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
+declare float @llvm.vector.reduce.fadd.nxv16f32(float, <vscale x 16 x float>)
+declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
+declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
+declare double @llvm.vector.reduce.fadd.nxv8f64(double, <vscale x 8 x double>)
+declare double @llvm.vector.reduce.fadd.nxv16f64(double, <vscale x 16 x double>)
 
 declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32)
 declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32)