diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 7787f58683b2a4..a6b5235235ff3b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -525,6 +525,12 @@ class LoopVectorizationPlanner { bool isMoreProfitable(const VectorizationFactor &A, const VectorizationFactor &B) const; + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B in the context of vectorizing a loop with known \p MaxTripCount. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B, + const unsigned MaxTripCount) const; + /// Determines if we have the infrastructure to vectorize the loop and its /// epilogue, assuming the main loop is vectorized by \p VF. bool isCandidateForEpilogueVectorization(const ElementCount VF) const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index aa900038e40aea..1d9e4f5a19f5ce 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1516,7 +1516,10 @@ class LoopVectorizationCostModel { /// Returns true if epilogue vectorization is considered profitable, and /// false otherwise. /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + /// \p Multiplier is an aditional scaling factor applied to VF before + /// comparing to EpilogueVectorizationMinVF. + bool isEpilogueVectorizationProfitable(const ElementCount VF, + const unsigned Multiplier) const; /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -4289,12 +4292,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { } bool LoopVectorizationPlanner::isMoreProfitable( - const VectorizationFactor &A, const VectorizationFactor &B) const { + const VectorizationFactor &A, const VectorizationFactor &B, + const unsigned MaxTripCount) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); - // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); @@ -4343,6 +4345,12 @@ bool LoopVectorizationPlanner::isMoreProfitable( return CmpFn(RTCostA, RTCostB); } +bool LoopVectorizationPlanner::isMoreProfitable( + const VectorizationFactor &A, const VectorizationFactor &B) const { + const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); + return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); +} + void LoopVectorizationPlanner::emitInvalidCostRemarks( OptimizationRemarkEmitter *ORE) { using RecipeVFPair = std::pair; @@ -4661,7 +4669,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( } bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( - const ElementCount VF) const { + const ElementCount VF, const unsigned Multiplier) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops @@ -4676,9 +4684,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - unsigned Multiplier = 1; - if (VF.isScalable()) - Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; @@ -4724,7 +4729,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } - if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { + unsigned Multiplier = IC; + if (MainLoopVF.isScalable()) + Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1); + + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -4743,16 +4752,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; + unsigned MaxTripCount = 0; for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) continue; - // Skip candidate VFs with widths >= the estimate runtime VF (scalable - // vectors) or the VF of the main loop (fixed vectors). + // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable + // vectors) or > the VF of the main loop (fixed vectors). if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) + (NextVF.Width.isScalable() && + ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || + (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && + ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) continue; // If NextVF is greater than the number of remaining iterations, the @@ -4766,6 +4779,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( "Trip count SCEV must be computable"); RemainingIterations = SE.getURemExpr( TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); + MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; + if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, + SE.getConstant(TCType, MaxTripCount))) { + MaxTripCount = + SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); + } + LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " + << MaxTripCount << "\n"); } if (SE.isKnownPredicate( CmpInst::ICMP_UGT, @@ -4774,7 +4795,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( continue; } - if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) + if (Result.Width.isScalar() || + isMoreProfitable(NextVF, Result, MaxTripCount)) Result = NextVF; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index 2ba460255c607d..3e48abc1af18b4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -16,7 +16,7 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: br i1 [[CMP_28]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 @@ -50,33 +50,33 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 8 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC5:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[N_VEC5:%.*]] = and i64 [[TMP0]], 4294967292 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX6]] -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX6]] -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i8>, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i16> -; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD7]] to <8 x i16> -; CHECK-NEXT: [[TMP18:%.*]] = mul nuw <8 x i16> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = lshr <8 x i16> [[TMP18]], splat (i16 8) -; CHECK-NEXT: [[TMP20:%.*]] = trunc nuw <8 x i16> [[TMP19]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP20]], ptr [[TMP15]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[WIDE_LOAD8]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = mul nuw <4 x i16> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = lshr <4 x i16> [[TMP18]], splat (i16 8) +; CHECK-NEXT: [[TMP20:%.*]] = trunc nuw <4 x i16> [[TMP19]] to <4 x i8> +; CHECK-NEXT: store <4 x i8> [[TMP20]], ptr [[TMP15]], align 1 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX6]] -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = mul nuw <8 x i16> [[TMP22]], [[TMP17]] -; CHECK-NEXT: [[TMP24:%.*]] = lshr <8 x i16> [[TMP23]], splat (i16 8) -; CHECK-NEXT: [[TMP25:%.*]] = trunc nuw <8 x i16> [[TMP24]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP25]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], 8 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext <4 x i8> [[WIDE_LOAD9]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw <4 x i16> [[TMP22]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = lshr <4 x i16> [[TMP23]], splat (i16 8) +; CHECK-NEXT: [[TMP25:%.*]] = trunc nuw <4 x i16> [[TMP24]] to <4 x i8> +; CHECK-NEXT: store <4 x i8> [[TMP25]], ptr [[TMP21]], align 1 +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], 4 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll new file mode 100644 index 00000000000000..4a2de58938043e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll @@ -0,0 +1,359 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -passes=loop-vectorize -force-vector-interleave=4 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) { +; CHECK-LABEL: @add_i8( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 64 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 48 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 48 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i8> [[WIDE_LOAD6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP14:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 16 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 32 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 48 +; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP16]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[TMP18]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[TMP19]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 8 +; CHECK-NEXT: [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = add <8 x i8> [[WIDE_LOAD13]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0 +; CHECK-NEXT: store <8 x i8> [[TMP26]], ptr [[TMP28]], align 1 +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP31]], [[TMP30]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %B, i64 %iv + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr %C, i64 %iv + %1 = load i8, ptr %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds i8, ptr %A, i64 %iv + store i8 %add, ptr %arrayidx6, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %Iterations + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) { +; CHECK-LABEL: @add_i16( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 24 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 16 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 24 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i16> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i16> [[WIDE_LOAD6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP13:%.*]] = add <8 x i16> [[WIDE_LOAD7]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP14:%.*]] = add <8 x i16> [[WIDE_LOAD8]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 16 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 24 +; CHECK-NEXT: store <8 x i16> [[TMP11]], ptr [[TMP16]], align 1 +; CHECK-NEXT: store <8 x i16> [[TMP12]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[TMP18]], align 1 +; CHECK-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP19]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 4 +; CHECK-NEXT: [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i16>, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i16>, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i16> [[WIDE_LOAD13]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, ptr [[TMP27]], i32 0 +; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP28]], align 1 +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i16 [[TMP31]], [[TMP30]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, ptr %B, i64 %iv + %0 = load i16, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i16, ptr %C, i64 %iv + %1 = load i16, ptr %arrayidx2, align 1 + %add = add i16 %1, %0 + %arrayidx6 = getelementptr inbounds i16, ptr %A, i64 %iv + store i16 %add, ptr %arrayidx6, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %Iterations + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, i64 noundef %Iterations) { +; CHECK-LABEL: @add_i32( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ITERATIONS:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[ITERATIONS]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ITERATIONS]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 8 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 12 +; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP16]], align 1 +; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP18]], align 1 +; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP19]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[ITERATIONS]], 4 +; CHECK-NEXT: [[N_VEC10:%.*]] = sub i64 [[ITERATIONS]], [[N_MOD_VF9]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX11]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[WIDE_LOAD13]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP28]], align 1 +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP31]], [[TMP30]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERATIONS]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %iv + %0 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %iv + %1 = load i32, ptr %arrayidx2, align 1 + %add = add i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, ptr %A, i64 %iv + store i32 %add, ptr %arrayidx6, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %Iterations + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index 547e7afbefcafe..d5110c3fdd70ff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -67,7 +67,7 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) { ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[START]], 0 ; CHECK-NEXT: br i1 [[C]], label %[[EXIT:.*]], label %[[ITER_CHECK:.*]] ; CHECK: [[ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[START]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[START]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[START]], 32 @@ -94,11 +94,11 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) { ; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[START]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[START]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[START]], [[N_MOD_VF2]] ; CHECK-NEXT: [[IND_END1:%.*]] = sub i64 [[START]], [[N_VEC3]] ; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC3]] @@ -108,8 +108,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) { ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 8f40c31382a860..f7f36d011535f4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -218,13 +218,15 @@ exit: define void @wide_truncated_iv(ptr %dst) { ; CHECK-LABEL: define void @wide_truncated_iv( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ , [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i8> [[VEC_IND]], splat (i8 8) ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] @@ -237,18 +239,41 @@ define void @wide_truncated_iv(ptr %dst) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 192 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 192, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[TMP7]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <8 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX3]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <8 x i8> [[VEC_IND4]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX3]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <8 x i8> [[VEC_IND4]], splat (i8 8) +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 200 +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 200, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i8 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i8 [[TRUNC_IV]], ptr [[GEP]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[IV]], 200 -; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -302,7 +327,7 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 @@ -328,7 +353,7 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) { ; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_2_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P_LCSSA:%.*]] = phi i64 [ [[P]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[P_LCSSA]] @@ -388,7 +413,7 @@ define void @zext_iv_increment(ptr %dst, i64 %N) { ; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -404,7 +429,7 @@ define void @zext_iv_increment(ptr %dst, i64 %N) { ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[IV_NEXT_EXT]] = zext i32 [[IV_NEXT]] to i64 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV_NEXT_EXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -435,9 +460,10 @@ exit: ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index e768ddd58362b4..bf64dccdb26676 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -10,9 +10,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-LABEL: @interleave_integer_reduction( -; INTERLEAVE-4-NEXT: entry: -; INTERLEAVE-4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16 -; INTERLEAVE-4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-4-NEXT: iter.check: +; INTERLEAVE-4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; INTERLEAVE-4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; INTERLEAVE-4: vector.main.loop.iter.check: +; INTERLEAVE-4-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 16 +; INTERLEAVE-4-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE-4: vector.ph: ; INTERLEAVE-4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 ; INTERLEAVE-4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -20,9 +23,9 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4: vector.body: ; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; INTERLEAVE-4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]] ; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 @@ -30,38 +33,64 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 ; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 ; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 ; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD4]] -; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] -; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] +; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] +; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] ; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE-4: middle.block: ; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]] -; INTERLEAVE-4-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]] -; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX7]] -; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]] +; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX8]] +; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]]) ; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; INTERLEAVE-4: scalar.ph: -; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; INTERLEAVE-4: vec.epilog.iter.check: +; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; INTERLEAVE-4: vec.epilog.ph: +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 4 +; INTERLEAVE-4-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]] +; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 +; INTERLEAVE-4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; INTERLEAVE-4: vec.epilog.vector.body: +; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-NEXT: [[TMP19:%.*]] = add i64 [[INDEX12]], 0 +; INTERLEAVE-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP19]] +; INTERLEAVE-4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP21]], align 1 +; INTERLEAVE-4-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI13]], [[WIDE_LOAD14]] +; INTERLEAVE-4-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4 +; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]] +; INTERLEAVE-4-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4: vec.epilog.middle.block: +; INTERLEAVE-4-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP22]]) +; INTERLEAVE-4-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] +; INTERLEAVE-4-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; INTERLEAVE-4: vec.epilog.scalar.ph: +; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ] ; INTERLEAVE-4-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4: loop: -; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; INTERLEAVE-4-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-4-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] ; INTERLEAVE-4-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 ; INTERLEAVE-4-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[L]] ; INTERLEAVE-4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; INTERLEAVE-4: exit: -; INTERLEAVE-4-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-4-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; INTERLEAVE-4-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; ; INTERLEAVE-2-LABEL: @interleave_integer_reduction( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll index c9990487665b98..e366deb3ff7774 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -16,59 +16,93 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-LABEL: @saddsat( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: +; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967280 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX6]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX2]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD9]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 16 -; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[NEXT_GEP7]], align 2 -; CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD4]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16 +; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[PDST]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP7]] +; CHECK-NEXT: [[DOTCAST7:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST7]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC6:%.*]] = and i64 [[TMP0]], 4294967292 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC6]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[N_VEC6]], 1 +; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[N_VEC6]], 1 +; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP9]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT21]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX16:%.*]] = shl i64 [[INDEX15]], 1 +; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX16]] +; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = shl i64 [[INDEX15]], 1 +; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX18]] +; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i16>, ptr [[NEXT_GEP17]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[WIDE_LOAD20]], <4 x i16> [[BROADCAST_SPLAT22]]) +; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[NEXT_GEP19]], align 2 +; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX15]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N24:%.*]] = icmp eq i64 [[N_VEC6]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N24]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi ptr [ [[IND_END12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL11]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP8]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 2 -; CHECK-NEXT: store i16 [[TMP9]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP13]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; @@ -120,61 +154,61 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16 -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD3]], <16 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 16 +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP2]], align 2 ; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]] +; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[DOTCAST6:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[TMP0]], 4294967288 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC7]] to i32 +; CHECK-NEXT: [[N_VEC5:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC5]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC7]] -; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC7]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[IND_END8:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC5]] +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC5]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT18]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX16]] -; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX16]] -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, ptr [[NEXT_GEP17]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]]) -; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP18]], align 2 -; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX16]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC7]] -; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX14]] +; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX14]] +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i8>, ptr [[NEXT_GEP15]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD17]], <8 x i8> [[BROADCAST_SPLAT19]]) +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP16]], align 2 +; CHECK-NEXT: [[INDEX_NEXT20]] = add nuw i64 [[INDEX14]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT20]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[N_VEC7]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N23]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N21:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N21]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END7]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi ptr [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL10]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP8]], i8 [[OFFSET]]) @@ -182,7 +216,7 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: store i8 [[TMP9]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index 292d8e2077d202..2a365c91cab54d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -12,7 +12,7 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -39,11 +39,11 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: @@ -51,12 +51,12 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i8> [[WIDE_LOAD5]], splat (i8 2) +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i8> [[WIDE_LOAD5]], splat (i8 2) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP11]], ptr [[TMP13]], align 1 -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 8 +; CHECK-NEXT: store <4 x i8> [[TMP11]], ptr [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: @@ -115,7 +115,7 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -142,11 +142,11 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: @@ -154,12 +154,12 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw <8 x i8> [[WIDE_LOAD5]], splat (i8 2) +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw <4 x i8> [[WIDE_LOAD5]], splat (i8 2) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP11]], ptr [[TMP13]], align 1 -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 8 +; CHECK-NEXT: store <4 x i8> [[TMP11]], ptr [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: @@ -285,7 +285,7 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP8]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -313,11 +313,11 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: @@ -325,13 +325,13 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX4]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[WIDE_LOAD5]] to <8 x i16> -; CHECK-NEXT: [[TMP13:%.*]] = add <8 x i16> [[TMP12]], splat (i16 2) +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i16> [[TMP12]], splat (i16 2) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0 -; CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[TMP15]], align 2 -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 8 +; CHECK-NEXT: store <4 x i16> [[TMP13]], ptr [[TMP15]], align 2 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: vec.epilog.middle.block: @@ -462,7 +462,7 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[ARG2]] to i32 ; CHECK-NEXT: [[CONV13:%.*]] = zext i8 [[ARG1]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -502,37 +502,37 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT6]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT7]] to <8 x i8> -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT9]] to <8 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT7]] to <4 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT9]] to <4 x i8> ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX10]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i8> [[WIDE_LOAD11]], splat (i8 4) -; CHECK-NEXT: [[TMP23:%.*]] = add <8 x i8> [[TMP22]], splat (i8 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i8> [[WIDE_LOAD11]], splat (i8 51) -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i8> [[TMP24]], splat (i8 60) -; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i8> [[TMP23]], [[TMP17]] -; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i8> [[TMP25]], splat (i8 -4) -; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i8> [[TMP27]], [[TMP18]] -; CHECK-NEXT: [[TMP29:%.*]] = mul <8 x i8> [[TMP28]], [[TMP26]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP21]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = shl <4 x i8> [[WIDE_LOAD11]], splat (i8 4) +; CHECK-NEXT: [[TMP23:%.*]] = add <4 x i8> [[TMP22]], splat (i8 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i8> [[WIDE_LOAD11]], splat (i8 51) +; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i8> [[TMP24]], splat (i8 60) +; CHECK-NEXT: [[TMP26:%.*]] = and <4 x i8> [[TMP23]], [[TMP17]] +; CHECK-NEXT: [[TMP27:%.*]] = and <4 x i8> [[TMP25]], splat (i8 -4) +; CHECK-NEXT: [[TMP28:%.*]] = xor <4 x i8> [[TMP27]], [[TMP18]] +; CHECK-NEXT: [[TMP29:%.*]] = mul <4 x i8> [[TMP28]], [[TMP26]] ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP29]], ptr [[TMP31]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 8 +; CHECK-NEXT: store <4 x i8> [[TMP29]], ptr [[TMP31]], align 1 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4 ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: vec.epilog.middle.block: @@ -610,7 +610,7 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[ARG2]] to i32 ; CHECK-NEXT: [[CONV13:%.*]] = zext i8 [[ARG1]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LEN]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 @@ -652,39 +652,39 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV13]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT6]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT7]] to <8 x i8> -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[CONV11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT9]] to <8 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[CONV13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT7]] to <4 x i8> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV11]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT9]] to <4 x i8> ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX10]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2 -; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[WIDE_LOAD11]] to <8 x i8> -; CHECK-NEXT: [[TMP25:%.*]] = shl <8 x i8> [[TMP24]], splat (i8 4) -; CHECK-NEXT: [[TMP26:%.*]] = add <8 x i8> [[TMP25]], splat (i8 32) -; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i8> [[TMP24]], splat (i8 -52) -; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i8> [[TMP27]], splat (i8 51) -; CHECK-NEXT: [[TMP29:%.*]] = mul <8 x i8> [[TMP28]], splat (i8 60) -; CHECK-NEXT: [[TMP30:%.*]] = and <8 x i8> [[TMP26]], [[TMP19]] -; CHECK-NEXT: [[TMP31:%.*]] = and <8 x i8> [[TMP29]], splat (i8 -4) -; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i8> [[TMP31]], [[TMP20]] -; CHECK-NEXT: [[TMP33:%.*]] = mul <8 x i8> [[TMP32]], [[TMP30]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i16>, ptr [[TMP23]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = trunc <4 x i16> [[WIDE_LOAD11]] to <4 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = shl <4 x i8> [[TMP24]], splat (i8 4) +; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i8> [[TMP25]], splat (i8 32) +; CHECK-NEXT: [[TMP27:%.*]] = and <4 x i8> [[TMP24]], splat (i8 -52) +; CHECK-NEXT: [[TMP28:%.*]] = or <4 x i8> [[TMP27]], splat (i8 51) +; CHECK-NEXT: [[TMP29:%.*]] = mul <4 x i8> [[TMP28]], splat (i8 60) +; CHECK-NEXT: [[TMP30:%.*]] = and <4 x i8> [[TMP26]], [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = and <4 x i8> [[TMP29]], splat (i8 -4) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i8> [[TMP31]], [[TMP20]] +; CHECK-NEXT: [[TMP33:%.*]] = mul <4 x i8> [[TMP32]], [[TMP30]] ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0 -; CHECK-NEXT: store <8 x i8> [[TMP33]], ptr [[TMP35]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 8 +; CHECK-NEXT: store <4 x i8> [[TMP33]], ptr [[TMP35]], align 1 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: vec.epilog.middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll index 55fd3530806d25..0add22fb588b88 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll @@ -7,7 +7,7 @@ target triple = "powerpc64le-unknown-linux-gnu" define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-LABEL: define i1 @select_exit_cond( ; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ITER_CHECK:.*]]: ; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 ; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[N]] @@ -15,12 +15,14 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[UMAX]], [[START2]] ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 [[TMP1]]) ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -96,24 +98,61 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP44]], [[TMP43]] -; CHECK-NEXT: [[BIN_RDX32:%.*]] = or <2 x i64> [[TMP45]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX33:%.*]] = or <2 x i64> [[TMP46]], [[BIN_RDX32]] -; CHECK-NEXT: [[BIN_RDX34:%.*]] = or <2 x i64> [[TMP47]], [[BIN_RDX33]] -; CHECK-NEXT: [[BIN_RDX35:%.*]] = or <2 x i64> [[TMP48]], [[BIN_RDX34]] -; CHECK-NEXT: [[BIN_RDX36:%.*]] = or <2 x i64> [[TMP49]], [[BIN_RDX35]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX36]] +; CHECK-NEXT: [[BIN_RDX18:%.*]] = or <2 x i64> [[TMP45]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX19:%.*]] = or <2 x i64> [[TMP46]], [[BIN_RDX18]] +; CHECK-NEXT: [[BIN_RDX20:%.*]] = or <2 x i64> [[TMP47]], [[BIN_RDX19]] +; CHECK-NEXT: [[BIN_RDX21:%.*]] = or <2 x i64> [[TMP48]], [[BIN_RDX20]] +; CHECK-NEXT: [[BIN_RDX22:%.*]] = or <2 x i64> [[TMP49]], [[BIN_RDX21]] +; CHECK-NEXT: [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX22]] ; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX37]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND30:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT31:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <2 x i64> [ [[TMP55]], %[[VEC_EPILOG_PH]] ], [ [[TMP56:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX29]], 0 +; CHECK-NEXT: [[NEXT_GEP33:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[NEXT_GEP33]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x i8>, ptr [[TMP58]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[WIDE_LOAD34]] to <2 x i64> +; CHECK-NEXT: [[TMP60:%.*]] = shl <2 x i64> [[VEC_IND30]], splat (i64 1) +; CHECK-NEXT: [[TMP61:%.*]] = shl <2 x i64> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP56]] = or <2 x i64> [[TMP61]], [[VEC_PHI32]] +; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT31]] = add <2 x i64> [[VEC_IND30]], splat (i64 2) +; CHECK-NEXT: [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC25]] +; CHECK-NEXT: br i1 [[TMP62]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP54:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP56]]) +; CHECK-NEXT: [[CMP_N36:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]] +; CHECK-NEXT: br i1 [[CMP_N36]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL28:%.*]] = phi ptr [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX37:%.*]] = phi i64 [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX37]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL28]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP53]] to i64 ; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[IV]], 1 @@ -124,9 +163,9 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[CMP_I166_I:%.*]] = icmp ult ptr [[PTR_IV]], [[END]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[IV]], [[N]] ; CHECK-NEXT: [[AND:%.*]] = select i1 [[CMP_I166_I]], i1 [[CMP2]], i1 false -; CHECK-NEXT: br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[AND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP52]], %[[MIDDLE_BLOCK]] ], [ [[TMP54]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES:%.*]] = icmp eq i64 [[RED_NEXT_LCSSA]], 0 ; CHECK-NEXT: ret i1 [[RES]] ; @@ -157,5 +196,6 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll index 786197bfdb90a7..5ccbd199e8fe22 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll @@ -4,7 +4,7 @@ ; the future we need to replace this with a more meaningful test of the ; epilogue vectorization cost-model. ; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-minimum-VF=4 -S | FileCheck %s --check-prefix=CHECK-MIN-4 -; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -S | FileCheck %s --check-prefix=CHECK-MIN-D +; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -force-vector-interleave=1 -S | FileCheck %s --check-prefix=CHECK-MIN-IC-1 target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-unknown-linux-gnu" @@ -85,14 +85,14 @@ for.end: ; preds = %for.end.loopexit, % ret void } -; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16. -; CHECK-MIN-D-LABEL: @f3 -; CHECK-MIN-D-NOT: vector.main.loop.iter.check -; CHECK-MIN-D-NOT: vec.epilog.iter.check -; CHECK-MIN-D-NOT: vec.epilog.ph -; CHECK-MIN-D-NOT: vec.epilog.vector.body -; CHECK-MIN-D-NOT: vec.epilog.middle.block -; CHECK-MIN-D: ret void +; Do not vectorize the epilogue for loops with VF*IC less than the default -epilogue-vectorization-minimum-VF of 16. +; CHECK-MIN-IC-1-LABEL: @f3 +; CHECK-MIN-IC-1-NOT: vector.main.loop.iter.check +; CHECK-MIN-IC-1-NOT: vec.epilog.iter.check +; CHECK-MIN-IC-1-NOT: vec.epilog.ph +; CHECK-MIN-IC-1-NOT: vec.epilog.vector.body +; CHECK-MIN-IC-1-NOT: vec.epilog.middle.block +; CHECK-MIN-IC-1: ret void ; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and ; make sure the epilogue gets vectorized in that case. @@ -104,6 +104,15 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-MIN-4: vec.epilog.middle.block ; CHECK-MIN-4: ret void +; Default behaviour is to vectorize the epilogue for this loop. +; CHECK-LABEL: @f3 +; CHECK: vector.main.loop.iter.check +; CHECK: vec.epilog.iter.check +; CHECK: vec.epilog.ph +; CHECK: vec.epilog.vector.body +; CHECK: vec.epilog.middle.block +; CHECK: ret void + define dso_local void @f3(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 signext %N) { entry: %cmp1 = icmp sgt i32 %N, 0 @@ -134,4 +143,4 @@ for.end: ; preds = %for.end.loopexit, % } attributes #0 = { minsize } -attributes #1 = { optsize } \ No newline at end of file +attributes #1 = { optsize } diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 3363aea39e73f6..a1cc59909adbe6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -11,7 +11,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK: iter.check: ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -3 ; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], 32 @@ -39,34 +39,34 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 16 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 16 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]] ; CHECK-NEXT: [[IND_END4:%.*]] = add i64 3, [[N_VEC3]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i8 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP8]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND9:%.*]] = phi <16 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX11:%.*]] = add i64 3, [[INDEX8]] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX11]], 0 +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX10:%.*]] = add i64 3, [[INDEX7]] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX10]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <16 x i8> [[VEC_IND9]], ptr [[TMP11]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX8]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT10]] = add <16 x i8> [[VEC_IND9]], splat (i8 16) -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]] +; CHECK-NEXT: store <4 x i8> [[VEC_IND8]], ptr [[TMP11]], align 1 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N7]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -128,18 +128,18 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD_2]], splat (i64 3) ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD_3]], splat (i64 3) ; CHECK-NEXT: [[TMP12:%.*]] = sitofp <2 x i64> [[TMP8]] to <2 x float> -; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float> -; CHECK-NEXT: [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float> -; CHECK-NEXT: [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float> -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 4 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 6 -; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP20]], align 4 -; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4 -; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4 -; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP23]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float> +; CHECK-NEXT: [[TMP19:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float> +; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float> +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 6 +; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP18]], ptr [[TMP15]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP20]], ptr [[TMP17]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 6f67d1e283e395..9f6362b3616859 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -68,25 +68,27 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-LABEL: @PR27826( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: preheader: +; CHECK-NEXT: br i1 [[CMP]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[T0:%.*]] = sext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[T0]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 5 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 32 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 32 @@ -201,9 +203,9 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[TMP113:%.*]] = insertelement <4 x float> [[TMP112]], float [[TMP109]], i32 2 ; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x float> [[TMP113]], float [[TMP110]], i32 3 ; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <4 x float> [[TMP42]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <4 x float> [[TMP50]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <4 x float> [[TMP58]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <4 x float> [[TMP66]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <4 x float> [[TMP50]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <4 x float> [[TMP58]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <4 x float> [[TMP66]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP119]] = fadd fast <4 x float> [[TMP115]], [[TMP90]] ; CHECK-NEXT: [[TMP120]] = fadd fast <4 x float> [[TMP116]], [[TMP98]] ; CHECK-NEXT: [[TMP121]] = fadd fast <4 x float> [[TMP117]], [[TMP106]] @@ -213,18 +215,72 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP120]], [[TMP119]] -; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX4]] -; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX5]]) +; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast <4 x float> [[TMP121]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <4 x float> [[TMP122]], [[BIN_RDX5]] +; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX6]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP124]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END9:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF7]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC8]], 32 +; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP125]], [[VEC_EPILOG_PH]] ], [ [[TMP155:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX12:%.*]] = mul i64 [[INDEX10]], 32 +; CHECK-NEXT: [[TMP126:%.*]] = add i64 [[OFFSET_IDX12]], 0 +; CHECK-NEXT: [[TMP127:%.*]] = add i64 [[OFFSET_IDX12]], 32 +; CHECK-NEXT: [[TMP128:%.*]] = add i64 [[OFFSET_IDX12]], 64 +; CHECK-NEXT: [[TMP129:%.*]] = add i64 [[OFFSET_IDX12]], 96 +; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP126]] +; CHECK-NEXT: [[TMP131:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP127]] +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP128]] +; CHECK-NEXT: [[TMP133:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP129]] +; CHECK-NEXT: [[TMP134:%.*]] = load float, ptr [[TMP130]], align 4 +; CHECK-NEXT: [[TMP135:%.*]] = load float, ptr [[TMP131]], align 4 +; CHECK-NEXT: [[TMP136:%.*]] = load float, ptr [[TMP132]], align 4 +; CHECK-NEXT: [[TMP137:%.*]] = load float, ptr [[TMP133]], align 4 +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <4 x float> poison, float [[TMP134]], i32 0 +; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x float> [[TMP138]], float [[TMP135]], i32 1 +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <4 x float> [[TMP139]], float [[TMP136]], i32 2 +; CHECK-NEXT: [[TMP141:%.*]] = insertelement <4 x float> [[TMP140]], float [[TMP137]], i32 3 +; CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP126]] +; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP127]] +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP128]] +; CHECK-NEXT: [[TMP145:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP129]] +; CHECK-NEXT: [[TMP146:%.*]] = load float, ptr [[TMP142]], align 4 +; CHECK-NEXT: [[TMP147:%.*]] = load float, ptr [[TMP143]], align 4 +; CHECK-NEXT: [[TMP148:%.*]] = load float, ptr [[TMP144]], align 4 +; CHECK-NEXT: [[TMP149:%.*]] = load float, ptr [[TMP145]], align 4 +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <4 x float> poison, float [[TMP146]], i32 0 +; CHECK-NEXT: [[TMP151:%.*]] = insertelement <4 x float> [[TMP150]], float [[TMP147]], i32 1 +; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x float> [[TMP151]], float [[TMP148]], i32 2 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x float> [[TMP152]], float [[TMP149]], i32 3 +; CHECK-NEXT: [[TMP154:%.*]] = fadd fast <4 x float> [[TMP141]], [[VEC_PHI11]] +; CHECK-NEXT: [[TMP155]] = fadd fast <4 x float> [[TMP154]], [[TMP153]] +; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP156:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP156]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP157:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP155]]) +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N14]], label [[LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX15:%.*]] = phi float [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ] -; CHECK-NEXT: [[S_02:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ] +; CHECK-NEXT: [[S_02:%.*]] = phi float [ [[BC_MERGE_RDX15]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] @@ -233,9 +289,9 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[ADD4]] = fadd fast float [[ADD]], [[T2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]] -; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: loopexit: -; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ], [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD4_LCSSA]], [[LOOPEXIT]] ] @@ -314,21 +370,21 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META4:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META5:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META8:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META7:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT13]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT14]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = and <2 x i1> [[TMP17]], [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i1> [[TMP19]] to <2 x i8> -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i8> [[TMP21]], i32 1 -; CHECK-NEXT: store i8 [[TMP23]], ptr [[DST]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT10]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1 +; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META12:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -351,7 +407,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[IV_1_NEXT_WIDE]] = zext i32 [[IV_1_NEXT]] to i64 ; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_1_NEXT_WIDE]], [[B]] -; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -432,7 +488,7 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[TMP27]] = or <2 x i1> [[VEC_PHI3]], [[TMP25]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP27]], [[TMP26]] ; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[BIN_RDX]]) @@ -452,7 +508,7 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[ANY_OF_NEXT]] = select i1 [[CMP13_NOT_NOT]], i1 [[ANY_OF]], i1 false ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 40 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[ANY_OF_NEXT_LCSSA:%.*]] = phi i1 [ [[ANY_OF_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i1 [[ANY_OF_NEXT_LCSSA]] @@ -533,10 +589,13 @@ exit: define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 @@ -544,7 +603,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 @@ -578,147 +637,206 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 -; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: +; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK: pred.store.if8: ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK: pred.store.continue9: ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; CHECK: pred.store.if9: +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; CHECK: pred.store.if10: ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] -; CHECK: pred.store.continue10: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK: pred.store.continue11: ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; CHECK: pred.store.if11: +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; CHECK: pred.store.if12: ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK: pred.store.continue13: ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 -; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] -; CHECK: pred.store.if13: +; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; CHECK: pred.store.if14: ; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.continue14: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] +; CHECK: pred.store.continue15: ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 -; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] -; CHECK: pred.store.if15: +; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: ; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] -; CHECK: pred.store.continue16: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] -; CHECK: pred.store.if17: +; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: ; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] -; CHECK: pred.store.continue18: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] -; CHECK: pred.store.if19: +; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; CHECK: pred.store.if20: ; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 ; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] -; CHECK: pred.store.continue20: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] -; CHECK: pred.store.if21: +; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; CHECK: pred.store.if22: ; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP5]], 2 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] -; CHECK: pred.store.continue22: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] +; CHECK: pred.store.continue23: ; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 -; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] -; CHECK: pred.store.if23: +; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; CHECK: pred.store.if24: ; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] -; CHECK: pred.store.continue24: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] +; CHECK: pred.store.continue25: ; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 -; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] -; CHECK: pred.store.if25: +; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; CHECK: pred.store.if26: ; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] -; CHECK: pred.store.continue26: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] +; CHECK: pred.store.continue27: ; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] -; CHECK: pred.store.if27: +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; CHECK: pred.store.if28: ; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] -; CHECK: pred.store.continue28: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] +; CHECK: pred.store.continue29: ; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] -; CHECK: pred.store.if29: +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; CHECK: pred.store.if30: ; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] -; CHECK: pred.store.continue30: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] +; CHECK: pred.store.continue31: ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] -; CHECK: pred.store.if31: +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; CHECK: pred.store.if32: ; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.continue32: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] +; CHECK: pred.store.continue33: ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 -; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] -; CHECK: pred.store.if33: +; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] +; CHECK: pred.store.if34: ; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE34]] -; CHECK: pred.store.continue34: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] +; CHECK: pred.store.continue35: ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]] -; CHECK: pred.store.if35: +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.if36: ; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE36]] -; CHECK: pred.store.continue36: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.continue37: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 +; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] +; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] +; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 +; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[TMP86]], 2 +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP87]] +; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 +; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 +; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; CHECK: pred.store.if43: +; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] +; CHECK: pred.store.continue44: +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; CHECK: pred.store.if45: +; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 +; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] +; CHECK: pred.store.continue46: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; CHECK: pred.store.if47: +; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 +; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 +; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] +; CHECK: pred.store.continue48: +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.if49: +; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 +; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.continue50: +; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 +; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]] +; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] ; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 @@ -731,7 +849,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -796,31 +914,31 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <2 x i64> [[TMP9]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX5]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX6]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX6]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP17:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP18]] = add i64 [[TMP17]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP12]] = add i64 [[TMP15]], 1 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw [9 x i8], ptr null, i64 [[IV_NEXT]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[GEP]], [[END]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP18]], [[LOOP]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP12]], [[LOOP]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[DOTLCSSA]] ; entry: @@ -861,16 +979,16 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-NEXT: store i32 [[TMP14]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: store i32 [[TMP10]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] @@ -886,7 +1004,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -936,7 +1054,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[TMP1]]) ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -950,7 +1068,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[RED_MUL]] = mul nsw i64 [[RED]], [[P_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RES]] @@ -997,7 +1115,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 1), [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1014,7 +1132,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[RED_MUL]] = mul i64 [[SHL]], [[RED]] ; CHECK-NEXT: [[IV_NEXT_I_I_I]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_MUL_LCSSA:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RED_MUL_LCSSA]] @@ -1061,7 +1179,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1> @@ -1080,7 +1198,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[OR_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index 9092da6092595a..e94bd841360256 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -11,7 +11,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX1]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV_START]] to i32 @@ -57,40 +57,40 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IV_START]], [[N_VEC]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 8 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]] ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[IV_START]], [[N_VEC4]] ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 -; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i32> poison, i32 [[TMP20]], i64 0 -; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT10]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION12:%.*]] = add <8 x i32> [[DOTSPLAT11]], +; CHECK-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i64 0 +; CHECK-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION11:%.*]] = add <4 x i32> [[DOTSPLAT10]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND13:%.*]] = phi <8 x i32> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX15:%.*]] = add i64 [[IV_START]], [[INDEX9]] -; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[OFFSET_IDX15]] to i32 +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <4 x i32> [ [[INDUCTION11]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX14:%.*]] = add i64 [[IV_START]], [[INDEX8]] +; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[OFFSET_IDX14]] to i32 ; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], -1 -; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i32> [[VEC_IND13]], splat (i32 196608) -; CHECK-NEXT: [[TMP25:%.*]] = lshr exact <8 x i32> [[TMP24]], splat (i32 16) -; CHECK-NEXT: [[TMP26:%.*]] = trunc <8 x i32> [[TMP25]] to <8 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = mul <4 x i32> [[VEC_IND12]], splat (i32 196608) +; CHECK-NEXT: [[TMP25:%.*]] = lshr exact <4 x i32> [[TMP24]], splat (i32 16) +; CHECK-NEXT: [[TMP26:%.*]] = trunc <4 x i32> [[TMP25]] to <4 x i16> ; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP23]] to i64 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[ARR]], i64 [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[TMP28]], i32 0 -; CHECK-NEXT: store <8 x i16> [[TMP26]], ptr [[TMP29]], align 2 -; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX9]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <8 x i32> [[VEC_IND13]], splat (i32 8) -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC4]] +; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP29]], align 2 +; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX8]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i32> [[VEC_IND12]], splat (i32 4) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_SCEVCHECK]] ], [ [[IV_START]], [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -110,6 +110,8 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK: exit: ; CHECK-NEXT: ret void ; + + entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index 39fa87fbb569f6..43366314f52477 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -18,11 +18,14 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-LABEL: @fp_iv_loop1( ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; AUTO_VEC-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AUTO_VEC: for.body.preheader: +; AUTO_VEC-NEXT: br i1 [[CMP4]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; AUTO_VEC: iter.check: ; AUTO_VEC-NEXT: [[ZEXT:%.*]] = zext nneg i32 [[N]] to i64 -; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 32 -; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AUTO_VEC: vector.main.loop.iter.check: +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 32 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[ZEXT]], 2147483616 ; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float @@ -49,16 +52,46 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] +; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AUTO_VEC: vec.epilog.iter.check: +; AUTO_VEC-NEXT: [[DOTCAST7:%.*]] = uitofp nneg i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast float [[DOTCAST7]], 5.000000e-01 +; AUTO_VEC-NEXT: [[IND_END8:%.*]] = fadd fast float [[TMP6]], 1.000000e+00 +; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[ZEXT]], 28 +; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]] +; AUTO_VEC: vec.epilog.ph: +; AUTO_VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[N_VEC3:%.*]] = and i64 [[ZEXT]], 2147483644 +; AUTO_VEC-NEXT: [[DOTCAST5:%.*]] = uitofp nneg i64 [[N_VEC3]] to float +; AUTO_VEC-NEXT: [[TMP7:%.*]] = fmul fast float [[DOTCAST5]], 5.000000e-01 +; AUTO_VEC-NEXT: [[IND_END6:%.*]] = fadd fast float [[TMP7]], 1.000000e+00 +; AUTO_VEC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[BC_RESUME_VAL]], i64 0 +; AUTO_VEC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; AUTO_VEC-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], +; AUTO_VEC-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AUTO_VEC: vec.epilog.vector.body: +; AUTO_VEC-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND11:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX10]] +; AUTO_VEC-NEXT: store <4 x float> [[VEC_IND11]], ptr [[TMP8]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT12]] = fadd fast <4 x float> [[VEC_IND11]], splat (float 2.000000e+00) +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC3]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AUTO_VEC: vec.epilog.middle.block: +; AUTO_VEC-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[N_VEC3]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[FOR_BODY]] ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; AUTO_VEC-NEXT: store float [[X_06]], ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AUTO_VEC-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[TMP6]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -164,7 +197,7 @@ define void @fp_iv_loop2(ptr noalias nocapture %A, i32 %N) { ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP4:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP5:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -221,7 +254,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], splat (double 4.800000e+01) ; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; AUTO_VEC-NEXT: [[IND_ESCAPE:%.*]] = fadd fast double [[TMP0]], -3.000000e+00 @@ -234,7 +267,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; AUTO_VEC-NEXT: [[J_NEXT]] = fadd fast double [[J]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] -; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -322,7 +355,7 @@ define double @external_use_without_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[J_NEXT_EPIL]] = fadd double [[J_EPIL]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP9:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[J_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[J_EPIL]], [[FOR_BODY_EPIL]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -357,11 +390,14 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-LABEL: @fadd_reassoc_FMF( ; AUTO_VEC-NEXT: entry: ; AUTO_VEC-NEXT: [[CMP_NOT11:%.*]] = icmp eq i32 [[N:%.*]], 0 -; AUTO_VEC-NEXT: br i1 [[CMP_NOT11]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AUTO_VEC: for.body.preheader: +; AUTO_VEC-NEXT: br i1 [[CMP_NOT11]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]] +; AUTO_VEC: iter.check: ; AUTO_VEC-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 32 -; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AUTO_VEC: vector.main.loop.iter.check: +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 32 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 ; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float @@ -379,13 +415,13 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 64 ; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 96 ; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 -; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 -; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 -; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 +; AUTO_VEC-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 +; AUTO_VEC-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 +; AUTO_VEC-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] -; AUTO_VEC-NEXT: [[TMP7:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] -; AUTO_VEC-NEXT: [[TMP8:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] -; AUTO_VEC-NEXT: [[TMP9:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] +; AUTO_VEC-NEXT: [[TMP7:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD2]] +; AUTO_VEC-NEXT: [[TMP8:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD3]] +; AUTO_VEC-NEXT: [[TMP9:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD4]] ; AUTO_VEC-NEXT: store <8 x float> [[TMP6]], ptr [[TMP2]], align 4 ; AUTO_VEC-NEXT: store <8 x float> [[TMP7]], ptr [[TMP3]], align 4 ; AUTO_VEC-NEXT: store <8 x float> [[TMP8]], ptr [[TMP4]], align 4 @@ -393,23 +429,55 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd reassoc <8 x float> [[STEP_ADD3]], splat (float 3.360000e+02) ; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AUTO_VEC: vec.epilog.iter.check: +; AUTO_VEC-NEXT: [[DOTCAST10:%.*]] = uitofp nneg i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul reassoc float [[DOTCAST10]], 4.200000e+01 +; AUTO_VEC-NEXT: [[IND_END11:%.*]] = fadd reassoc float [[TMP11]], 1.000000e+00 +; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 28 +; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY]], label [[VEC_EPILOG_PH]] +; AUTO_VEC: vec.epilog.ph: +; AUTO_VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi float [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AUTO_VEC-NEXT: [[N_VEC6:%.*]] = and i64 [[TMP0]], 4294967292 +; AUTO_VEC-NEXT: [[DOTCAST8:%.*]] = uitofp nneg i64 [[N_VEC6]] to float +; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul reassoc float [[DOTCAST8]], 4.200000e+01 +; AUTO_VEC-NEXT: [[IND_END9:%.*]] = fadd reassoc float [[TMP12]], 1.000000e+00 +; AUTO_VEC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[BC_RESUME_VAL]], i64 0 +; AUTO_VEC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; AUTO_VEC-NEXT: [[INDUCTION:%.*]] = fadd reassoc <4 x float> [[DOTSPLAT]], +; AUTO_VEC-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; AUTO_VEC: vec.epilog.vector.body: +; AUTO_VEC-NEXT: [[INDEX13:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND14:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[P]], i64 [[INDEX13]] +; AUTO_VEC-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP13]], align 4 +; AUTO_VEC-NEXT: [[TMP14:%.*]] = fadd reassoc <4 x float> [[VEC_IND14]], [[WIDE_LOAD16]] +; AUTO_VEC-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX13]], 4 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT15]] = fadd reassoc <4 x float> [[VEC_IND14]], splat (float 1.680000e+02) +; AUTO_VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC6]] +; AUTO_VEC-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; AUTO_VEC: vec.epilog.middle.block: +; AUTO_VEC-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[N_VEC6]], [[TMP0]] +; AUTO_VEC-NEXT: br i1 [[CMP_N18]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; AUTO_VEC: for.cond.cleanup: ; AUTO_VEC-NEXT: ret void ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P]], i64 [[INDVARS_IV]] -; AUTO_VEC-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP11]] +; AUTO_VEC-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP16]] ; AUTO_VEC-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[ADD3]] = fadd reassoc float [[X_012]], 4.200000e+01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AUTO_VEC-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; entry: %cmp.not11 = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index f415a81d3b7c73..bc74742d952abe 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -6,8 +6,11 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80: define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-LABEL: define i32 @iv_used_widened_and_truncated( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -36,21 +39,53 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], splat (i64 8) ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <8 x i32> [[STEP_ADD7]], splat (i32 8) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC9:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[BC_RESUME_VAL7]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT14]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION16:%.*]] = add <4 x i32> [[DOTSPLAT15]], ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT19:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND17:%.*]] = phi <4 x i32> [ [[INDUCTION16]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT18:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], <4 x i64> [[VEC_IND12]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VEC_IND17]], <4 x ptr> [[TMP7]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i64> [[VEC_IND12]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT18]] = add <4 x i32> [[VEC_IND17]], splat (i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N20]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: [[T:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: store i32 [[T]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; @@ -96,18 +131,18 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP4]], i32 4 -; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[TMP6]], align 2, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] -; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[TMP7]], align 2, !alias.scope [[META4]], !noalias [[META7]] +; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[TMP6]], align 2, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] +; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[TMP7]], align 2, !alias.scope [[META5]], !noalias [[META8]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 -; CHECK-NEXT: store <4 x i32> [[VEC_IND3]], ptr [[TMP10]], align 4, !alias.scope [[META7]] -; CHECK-NEXT: store <4 x i32> [[STEP_ADD4]], ptr [[TMP11]], align 4, !alias.scope [[META7]] +; CHECK-NEXT: store <4 x i32> [[VEC_IND3]], ptr [[TMP10]], align 4, !alias.scope [[META8]] +; CHECK-NEXT: store <4 x i32> [[STEP_ADD4]], ptr [[TMP11]], align 4, !alias.scope [[META8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i32> [[STEP_ADD4]], splat (i32 4) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -124,7 +159,7 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) { ; CHECK-NEXT: store i32 [[IV_32]], ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -171,7 +206,7 @@ define void @truncated_ivs_with_wide_and_scalar_uses(i1 %c, ptr %dst) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[STEP_ADD]], splat (i16 8) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -186,7 +221,7 @@ define void @truncated_ivs_with_wide_and_scalar_uses(i1 %c, ptr %dst) { ; CHECK-NEXT: store i16 [[SEL]], ptr [[GEP]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -266,50 +301,50 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1, !alias.scope [[META13:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1, !alias.scope [[META14:![0-9]+]] ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP22]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1, !alias.scope [[META13]] +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1, !alias.scope [[META14]] ; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> ; CHECK-NEXT: [[TMP22]] = add <16 x i32> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP22]], <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8> ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i32 0 -; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META16:![0-9]+]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META17:![0-9]+]], !noalias [[META14]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i32 1 -; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i32 2 -; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i32 3 -; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i32 4 -; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i32 5 -; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i32 6 -; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i32 7 -; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i32 8 -; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i32 9 -; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i32 10 -; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i32 11 -; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i32 12 -; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i32 13 -; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i32 14 -; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i8> [[TMP25]], i32 15 -; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META17]], !noalias [[META14]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184 -; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP22]], i32 15 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -339,7 +374,7 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[DEC]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[OUTPTR_0]] = getelementptr i8, ptr [[PTR_IV_1]], i64 2 ; CHECK-NEXT: [[CMP30_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP30_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP30_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -388,7 +423,7 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 @@ -405,7 +440,7 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV_NEXT]] to i16 ; CHECK-NEXT: [[REC_NEXT]] = mul i16 [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i16 [[REC_LCSSA]] @@ -430,8 +465,10 @@ exit: define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-LABEL: define i32 @test_scalar_predicated_cost( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer @@ -439,8 +476,8 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT4]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], splat (i64 8) ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], splat (i64 8) @@ -473,12 +510,42 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], splat (i64 8) ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL1]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = icmp ule <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP32]], splat (i1 true) +; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP35]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP33]]) +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100 +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp ule i64 [[IV]], [[Y]] @@ -492,7 +559,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; @@ -573,7 +640,7 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -585,7 +652,7 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) { ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -671,7 +738,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -688,7 +755,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 { ; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65 ; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32 -; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -745,7 +812,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -763,7 +830,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 { ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65 ; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[MUL3]] to i60 ; CHECK-NEXT: [[TRUNC_1]] = trunc i60 [[TRUNC_0]] to i32 -; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -822,7 +889,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -840,7 +907,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 { ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65 ; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32 ; CHECK-NEXT: [[DEAD_AND:%.*]] = and i32 [[TRUNC]], 123 -; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -872,27 +939,27 @@ attributes #1 = { "target-cpu"="skylake-avx512" "target-features"="-avx512f" } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[META4]] = !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]} -; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"} -; CHECK: [[META7]] = !{[[META8:![0-9]+]]} -; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]} -; CHECK: [[META13]] = !{[[META14:![0-9]+]]} -; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], !"LVerDomain"} -; CHECK: [[META16]] = !{[[META17:![0-9]+]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} -; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} -; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} -; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} -; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} -; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[META5]] = !{[[META6:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], !"LVerDomain"} +; CHECK: [[META8]] = !{[[META9:![0-9]+]]} +; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[META14]] = !{[[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; CHECK: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; CHECK: [[META17]] = !{[[META18:![0-9]+]]} +; CHECK: [[META18]] = distinct !{[[META18]], [[META16]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} +; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} ; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} @@ -901,4 +968,6 @@ attributes #1 = { "target-cpu"="skylake-avx512" "target-features"="-avx512f" } ; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} ; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} ; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll index b5effe73fa73d2..4ff5dc1bc91c69 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -27,85 +27,85 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX2]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 -; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD4]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD5]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD6]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 96 +; CHECK-NEXT: store <16 x i16> [[TMP4]], ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP8]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 -; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2 -; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END21:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP15]] -; CHECK-NEXT: [[DOTCAST17:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END18:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST17]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP13]] +; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST9]] ; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC15:%.*]] = and i64 [[TMP0]], 4294967288 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC15]] to i32 -; CHECK-NEXT: [[IND_END16:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[N_VEC15]], 1 -; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[N_VEC15]], 1 -; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP17]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT31]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC8]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[N_VEC8]], 1 +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[N_VEC8]], 1 +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP15]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT23]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[INDEX27]], 1 -; CHECK-NEXT: [[NEXT_GEP28:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[INDEX27]], 1 -; CHECK-NEXT: [[NEXT_GEP29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP19]] -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <8 x i16>, ptr [[NEXT_GEP28]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD30]], <8 x i16> [[BROADCAST_SPLAT32]]) -; CHECK-NEXT: store <8 x i16> [[TMP20]], ptr [[NEXT_GEP29]], align 2 -; CHECK-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX27]], 8 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC15]] -; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDEX17:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = shl i64 [[INDEX17]], 1 +; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX18]] +; CHECK-NEXT: [[OFFSET_IDX20:%.*]] = shl i64 [[INDEX17]], 1 +; CHECK-NEXT: [[NEXT_GEP21:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX20]] +; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i16>, ptr [[NEXT_GEP19]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD22]], <8 x i16> [[BROADCAST_SPLAT24]]) +; CHECK-NEXT: store <8 x i16> [[TMP16]], ptr [[NEXT_GEP21]], align 2 +; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX17]], 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC15]], [[TMP0]] +; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC8]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi ptr [ [[IND_END20]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 2 -; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP22]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP19:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP18]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 2 -; CHECK-NEXT: store i16 [[TMP23]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP19]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -142,7 +142,7 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 128 @@ -155,22 +155,22 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 -; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD3]], <32 x i8> [[WIDE_LOAD3]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD4]], <32 x i8> [[WIDE_LOAD4]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD5]], <32 x i8> [[WIDE_LOAD5]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 96 +; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP2]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP7]], ptr [[TMP10]], align 2 @@ -181,45 +181,45 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END21:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[DOTCAST17:%.*]] = trunc nuw i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END18:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST17]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 112 +; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[DOTCAST8:%.*]] = trunc nuw i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 120 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC15:%.*]] = and i64 [[TMP0]], 4294967280 -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC15]] to i32 -; CHECK-NEXT: [[IND_END16:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC15]] -; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC15]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT31]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[TMP0]], 4294967288 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc nuw i64 [[N_VEC7]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] +; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC7]] +; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP28:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX27]] -; CHECK-NEXT: [[NEXT_GEP29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX27]] -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <16 x i8>, ptr [[NEXT_GEP28]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD30]], <16 x i8> [[WIDE_LOAD30]], <16 x i8> [[BROADCAST_SPLAT32]]) -; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[NEXT_GEP29]], align 2 -; CHECK-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX27]], 16 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC15]] +; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX16]] +; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX16]] +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, ptr [[NEXT_GEP17]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i8> @llvm.fshl.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]]) +; CHECK-NEXT: store <8 x i8> [[TMP12]], ptr [[NEXT_GEP18]], align 2 +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX16]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC15]], [[TMP0]] -; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[N_VEC7]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N23]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi ptr [ [[IND_END20]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 ; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP14]], i8 [[TMP14]], i8 [[OFFSET]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 2fb4a68f4b5860..58c6827128feb2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -62,25 +62,25 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX2]], 9223372036854775800 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[BC_MERGE_RDX]], i64 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX15]] -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope [[META8:![0-9]+]] -; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI16]], [[WIDE_LOAD17]] +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <8 x i32> [ [[TMP11]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX14]] +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, ptr [[TMP12]], align 8, !alias.scope [[META8:![0-9]+]] +; CHECK-NEXT: [[TMP13]] = add <8 x i32> [[VEC_PHI15]], [[WIDE_LOAD16]] ; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META8]] -; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX15]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC13]] +; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX14]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]]) -; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: [[BC_MERGE_RDX19:%.*]] = phi i32 [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] @@ -126,7 +126,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 @@ -162,32 +162,53 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[SMAX2]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT17]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[SMAX2]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX11]] -; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META21:![0-9]+]], !noalias [[META24:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD12]], [[BROADCAST_SPLAT14]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT16]], ptr [[TMP4]], align 4, !alias.scope [[META21]], !noalias [[META24]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[BROADCAST_SPLAT16]], <8 x ptr> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP5]]), !alias.scope [[META24]] -; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[PRED_STORE_CONTINUE21:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX10]] +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META21:![0-9]+]], !noalias [[META24:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD11]], [[BROADCAST_SPLAT13]] +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT15]], ptr [[TMP4]], align 4, !alias.scope [[META21]], !noalias [[META24]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.if20: +; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[A]], align 4, !alias.scope [[META24]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX10]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[CMP_N10]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]] +; CHECK-NEXT: br i1 [[CMP_N23]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -291,28 +312,28 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_VEC17:%.*]] = and i64 [[SMAX10]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT21]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT23]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT26]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT20]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT22:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT23:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT22]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT25]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX19]] -; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD20]], [[BROADCAST_SPLAT22]] -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT24]], ptr [[TMP5]], align 4, !alias.scope [[META37]], !noalias [[META40]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX19]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META43:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD25]], <8 x ptr> [[BROADCAST_SPLAT27]], i32 4, <8 x i1> [[TMP6]]), !alias.scope [[META44:![0-9]+]], !noalias [[META43]] -; CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX19]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC17]] +; CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX18]] +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD19]], [[BROADCAST_SPLAT21]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT23]], ptr [[TMP5]], align 4, !alias.scope [[META37]], !noalias [[META40]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX18]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META43:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD24]], <8 x ptr> [[BROADCAST_SPLAT26]], i32 4, <8 x i1> [[TMP6]]), !alias.scope [[META44:![0-9]+]], !noalias [[META43]] +; CHECK-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX18]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC17]] ; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]] -; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]] +; CHECK-NEXT: br i1 [[CMP_N28]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index c6dfefe5c9ad4f..8688b246c60f4e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -205,48 +205,69 @@ exit: define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-LABEL: @test_tc_20( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 64 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 8 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 12 -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[TMP16]], align 64 -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD1]], ptr [[TMP17]], align 64 -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD2]], ptr [[TMP18]], align 64 -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD3]], ptr [[TMP19]], align 64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 64 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 64 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 64 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 12 +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD1]], ptr [[TMP8]], align 64 +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD2]], ptr [[TMP9]], align 64 +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP14]], align 64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP16]], align 64 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 20 +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64 ; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]] ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 20 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -284,7 +305,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -298,7 +319,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll index e945ad40039af8..7cf36fc669f37d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll @@ -6,8 +6,10 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80: define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-LABEL: define i32 @test_scalar_predicated_cost( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer @@ -15,8 +17,8 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT4]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], splat (i64 8) ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], splat (i64 8) @@ -51,10 +53,40 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL1]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT9]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX4]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = icmp ule <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP32]], splat (i1 true) +; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP35]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP33]]) +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100 +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp ule i64 [[IV]], [[Y]] @@ -68,7 +100,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; @@ -115,7 +147,7 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c) ; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP1]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -133,7 +165,7 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c) ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 116 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -189,7 +221,7 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <4 x i1> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -210,7 +242,7 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 116 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -244,9 +276,10 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 11383aad4e90bd..206bbdf262b727 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -76,7 +76,7 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo1( -; AVX2-NEXT: entry: +; AVX2-NEXT: iter.check: ; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 ; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64 ; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 @@ -88,10 +88,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 @@ -132,10 +134,34 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ] +; AVX2-NEXT: [[TMP37:%.*]] = add i64 [[INDEX11]], 0 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP37]] +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP29]], align 4 +; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP37]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison) +; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP37]] +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP30]]) +; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 +; AVX2-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] @@ -152,7 +178,7 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -226,16 +252,16 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX11]], 0 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP27]] ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP29]], align 4 -; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP29]], align 4 +; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP27]] ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison) -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP32]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison) +; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP27]] ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP30]]) +; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX512: vec.epilog.middle.block: @@ -351,7 +377,7 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo1_addrspace1( -; AVX2-NEXT: entry: +; AVX2-NEXT: iter.check: ; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64 ; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64 ; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64 @@ -363,10 +389,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0 @@ -405,12 +433,36 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP11]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ] +; AVX2-NEXT: [[TMP37:%.*]] = add i64 [[INDEX11]], 0 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP37]] +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP38]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP29]], align 4 +; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP37]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison) +; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP37]] +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP30]]) +; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 +; AVX2-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]] @@ -427,7 +479,7 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP7:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -501,16 +553,16 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX11]], 0 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP27]] ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP28]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP29]], align 4 -; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP29]], align 4 +; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP27]] ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison) -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison) +; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP27]] ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP30]]) +; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX512: vec.epilog.middle.block: @@ -637,7 +689,7 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo2( -; AVX2-NEXT: entry: +; AVX2-NEXT: iter.check: ; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 ; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64 ; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 @@ -649,10 +701,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128 ; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2: vector.main.loop.iter.check: +; AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: -; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 @@ -695,12 +749,37 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP24]], ptr [[TMP29]], i32 4, <8 x i1> [[TMP11]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ] +; AVX2-NEXT: [[TMP42:%.*]] = add i64 [[INDEX11]], 0 +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP42]] +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP33]], align 4 +; AVX2-NEXT: [[TMP34:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP42]] +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP36]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison) +; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> +; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]] +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP42]] +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP34]]) +; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 +; AVX2-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] @@ -718,7 +797,7 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP10:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -796,17 +875,17 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP31:%.*]] = add i64 [[INDEX11]], 0 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP31]] ; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP33]], align 4 -; AVX512-NEXT: [[TMP34:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP33]], align 4 +; AVX512-NEXT: [[TMP34:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP31]] ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0 -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP36]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison) -; AVX512-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> -; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP36]], i32 4, <16 x i1> [[TMP34]], <16 x float> poison) +; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]] ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP31]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0 -; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP34]]) -; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP38]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP34]]) +; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 ; AVX512-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; AVX512: vec.epilog.middle.block: @@ -875,96 +954,184 @@ for.end: ; preds = %for.inc ;} define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 { -; AVX-LABEL: @foo3( -; AVX-NEXT: entry: -; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; AVX: vector.memcheck: -; AVX-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 -; AVX-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 -; AVX-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 -; AVX-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; AVX-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] -; AVX-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] -; AVX-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] -; AVX-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; AVX-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; AVX: vector.ph: -; AVX-NEXT: br label [[VECTOR_BODY:%.*]] -; AVX: vector.body: -; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] -; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12 -; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META8:![0-9]+]] -; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META8]] -; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META8]] -; AVX-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META8]] -; AVX-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 100) -; AVX-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], splat (i32 100) -; AVX-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100) -; AVX-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100) -; AVX-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0 -; AVX-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12 -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]] -; AVX-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]] -; AVX-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]] -; AVX-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]] -; AVX-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> -; AVX-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> -; AVX-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> -; AVX-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double> -; AVX-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP15]] -; AVX-NEXT: [[TMP20:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]] -; AVX-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]] -; AVX-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 -; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8 -; AVX-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12 -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]] -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]] -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]] -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]] -; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] -; AVX: middle.block: -; AVX-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX: scalar.ph: -; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; AVX-NEXT: br label [[FOR_BODY:%.*]] -; AVX: for.body: -; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100 -; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; AVX: if.then: -; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double -; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]] -; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] -; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 -; AVX-NEXT: br label [[FOR_INC]] -; AVX: for.inc: -; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; AVX: for.end: -; AVX-NEXT: ret void +; AVX1-LABEL: @foo3( +; AVX1-NEXT: entry: +; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX1: vector.memcheck: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; AVX1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX1: vector.ph: +; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META8:![0-9]+]] +; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META8]] +; AVX1-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META8]] +; AVX1-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META8]] +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 100) +; AVX1-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], splat (i32 100) +; AVX1-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100) +; AVX1-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100) +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0 +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4 +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8 +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX1-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> +; AVX1-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> +; AVX1-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double> +; AVX1-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP15]] +; AVX1-NEXT: [[TMP20:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]] +; AVX1-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]] +; AVX1-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]] +; AVX1-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0 +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8 +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; AVX1: middle.block: +; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX1: scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @foo3( +; AVX2-NEXT: entry: +; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX2: vector.memcheck: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 +; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000 +; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX2: vector.ph: +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META11]] +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META11]] +; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META11]] +; AVX2-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 100) +; AVX2-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], splat (i32 100) +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100) +; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100) +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META14:![0-9]+]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META14]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META14]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META14]] +; AVX2-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX2-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> +; AVX2-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> +; AVX2-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double> +; AVX2-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP15]] +; AVX2-NEXT: [[TMP20:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]] +; AVX2-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]] +; AVX2-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0 +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8 +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META16]], !noalias [[META18]] +; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX2-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX2: middle.block: +; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX2: scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX2: for.end: +; AVX2-NEXT: ret void ; ; AVX512-LABEL: @foo3( -; AVX512-NEXT: entry: +; AVX512-NEXT: iter.check: ; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000 @@ -978,10 +1145,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX512: vector.main.loop.iter.check: +; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 @@ -1026,10 +1195,35 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; AVX512: middle.block: -; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[TMP40:%.*]] = add i64 [[INDEX12]], 0 +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP40]] +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP31]], align 4, !alias.scope [[META20:![0-9]+]] +; AVX512-NEXT: [[TMP32:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], splat (i32 100) +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP40]] +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP33]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP34]], i32 8, <8 x i1> [[TMP32]], <8 x double> poison), !alias.scope [[META23:![0-9]+]] +; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> +; AVX512-NEXT: [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD14]], [[TMP35]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP40]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP37]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP36]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP32]]), !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]] +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 +; AVX512-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP39]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; AVX512-NEXT: br label [[FOR_BODY1:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] @@ -1047,7 +1241,7 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP29:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1137,19 +1331,19 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META21:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META30:![0-9]+]] ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100) ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], splat (i64 1) ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META24:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META33:![0-9]+]] ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> ; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]] ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META26:![0-9]+]], !noalias [[META28:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META35:![0-9]+]], !noalias [[META37:![0-9]+]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 128) ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 -; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: @@ -1173,7 +1367,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1273,13 +1467,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -3 ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -12 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -3 -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META18:![0-9]+]] +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META21:![0-9]+]] ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META18]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META21]] ; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META18]] +; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META21]] ; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META18]] +; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META21]] ; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> ; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer ; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer @@ -1295,16 +1489,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -12 ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -3 ; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP16]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META21:![0-9]+]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP16]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META24:![0-9]+]] ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP18]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META21]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP18]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META24]] ; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META21]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META24]] ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META21]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META24]] ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP23:%.*]] = fadd <4 x double> [[REVERSE13]], splat (double 5.000000e-01) ; AVX2-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[REVERSE16]], splat (double 5.000000e-01) @@ -1320,16 +1514,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -12 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -3 ; AVX2-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope [[META23:![0-9]+]], !noalias [[META25:![0-9]+]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope [[META26:![0-9]+]], !noalias [[META28:![0-9]+]] ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope [[META23]], !noalias [[META25]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope [[META26]], !noalias [[META28]] ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope [[META23]], !noalias [[META25]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope [[META26]], !noalias [[META28]] ; AVX2-NEXT: [[REVERSE30:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope [[META23]], !noalias [[META25]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope [[META26]], !noalias [[META28]] ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: @@ -1351,7 +1545,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1385,13 +1579,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -7 ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -24 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7 -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META31:![0-9]+]] +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META40:![0-9]+]] ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META31]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META40]] ; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META31]] +; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META40]] ; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META31]] +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META40]] ; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: [[TMP10:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer @@ -1407,16 +1601,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -24 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -7 ; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META34:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META43:![0-9]+]] ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META34]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META43]] ; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META34]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META43]] ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META34]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META43]] ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> [[REVERSE13]], splat (double 5.000000e-01) ; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[REVERSE16]], splat (double 5.000000e-01) @@ -1432,16 +1626,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -24 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -7 ; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META36:![0-9]+]], !noalias [[META38:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META45:![0-9]+]], !noalias [[META47:![0-9]+]] ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META36]], !noalias [[META38]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META45]], !noalias [[META47]] ; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META36]], !noalias [[META38]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META45]], !noalias [[META47]] ; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META36]], !noalias [[META38]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META45]], !noalias [[META47]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: @@ -1463,7 +1657,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1507,8 +1701,11 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX1: for.body.preheader: +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX1: vector.main.loop.iter.check: ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: @@ -1574,12 +1771,45 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ] +; AVX1-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0 +; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]] +; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 +; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) +; AVX1-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX1-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) +; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] +; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]]) +; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX1-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; AVX1-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 @@ -1597,7 +1827,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP20:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -1607,8 +1837,11 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: entry: ; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 ; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX2: for.body.preheader: +; AVX2: iter.check: ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: @@ -1671,15 +1904,48 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ] +; AVX2-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0 +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]] +; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 +; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) +; AVX2-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX2-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) +; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] +; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]]) +; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX2-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; AVX2-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 @@ -1697,7 +1963,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP33:![0-9]+]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -1707,8 +1973,11 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: entry: ; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 ; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX512: for.body.preheader: +; AVX512: iter.check: ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: @@ -1771,15 +2040,48 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0 +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]] +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1 +; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1) +; AVX512-NEXT: [[TMP45:%.*]] = icmp eq <8 x i8> [[TMP44]], zeroinitializer +; AVX512-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[TMP45]], splat (i1 true) +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP49:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP49]], splat (i1 true) +; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <8 x i1> [[TMP51]]) +; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8 +; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: br label [[FOR_BODY1:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX512-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 @@ -1797,7 +2099,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP52:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -1852,8 +2154,11 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX1: for.body.preheader: +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX1: vector.main.loop.iter.check: ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: @@ -1916,15 +2221,48 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX1-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ] +; AVX1-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0 +; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]] +; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 +; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 +; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) +; AVX1-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX1-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] +; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 +; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX1-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) +; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] +; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]]) +; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX1-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; AVX1-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 @@ -1942,7 +2280,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP23:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -1952,8 +2290,11 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: entry: ; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 ; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX2: for.body.preheader: +; AVX2: iter.check: ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: @@ -2016,15 +2357,48 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX2-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ] +; AVX2-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0 +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]] +; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 +; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 +; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) +; AVX2-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX2-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 +; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX2-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) +; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] +; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]]) +; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 +; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX2: vec.epilog.scalar.ph: +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: -; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX2-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; AVX2-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 @@ -2042,7 +2416,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP36:![0-9]+]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2052,8 +2426,11 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: entry: ; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 ; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; AVX512: for.body.preheader: +; AVX512: iter.check: ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512: vector.main.loop.iter.check: ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: @@ -2116,15 +2493,48 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX512: vec.epilog.iter.check: +; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX512: vec.epilog.ph: +; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[N_MOD_VF8:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; AVX512-NEXT: [[N_VEC9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF8]] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: vec.epilog.vector.body: +; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[INDEX10]], 0 +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP55]] +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1 +; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1) +; AVX512-NEXT: [[TMP45:%.*]] = icmp eq <8 x i8> [[TMP44]], zeroinitializer +; AVX512-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[TMP45]], splat (i1 true) +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 +; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison) +; AVX512-NEXT: [[TMP49:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP49]], splat (i1 true) +; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <8 x i1> [[TMP51]]) +; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8 +; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]] +; AVX512: vec.epilog.middle.block: +; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] +; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512: vec.epilog.scalar.ph: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: br label [[FOR_BODY1:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; AVX512-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 @@ -2142,7 +2552,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP55:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index 6bd70cefcaf746..89febbccfd8d29 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -8,12 +8,13 @@ target triple = "x86_64-unknown-linux-gnu" define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrspace(1) align 8 dereferenceable_or_null(8), i64) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[PREHEADER:%.*]] -; CHECK: preheader: +; CHECK-NEXT: br label [[ITER_CHECK:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[DOT10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 16 ; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2:%.*]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: [[UMAX2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 16 @@ -22,9 +23,12 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr addrspace(1) [[DOT10]], [[SCEVGEP1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr addrspace(1) [[DOT12]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -16 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX2]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -33,35 +37,55 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 64 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope [[META0]] -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope [[META0]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 32 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 96 ; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD6]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[UMAX2]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[UMAX2]], -4 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDEX9]] +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP14]], align 8, !alias.scope [[META8:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX9]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD10]], ptr addrspace(1) [[TMP15]], align 8, !alias.scope [[META11:![0-9]+]], !noalias [[META8]] +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX9]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N12]], label [[LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[DOT18:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDVARS_IV3]] ; CHECK-NEXT: [[V:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[DOT18]], align 8 ; CHECK-NEXT: [[DOT20:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDVARS_IV3]] ; CHECK-NEXT: store ptr addrspace(1) [[V]], ptr addrspace(1) [[DOT20]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 ; CHECK-NEXT: [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]] -; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index 4517583965ea4f..7b1c7ae94ff41a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -173,8 +173,11 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AVX1: for.body.preheader: +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX1: vector.main.loop.iter.check: ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: @@ -265,12 +268,49 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX1: vec.epilog.iter.check: +; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; AVX1: vec.epilog.ph: +; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: vec.epilog.vector.body: +; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY]] ] +; AVX1-NEXT: [[TMP67:%.*]] = add i64 [[INDEX26]], 0 +; AVX1-NEXT: [[TMP68:%.*]] = shl nuw nsw i64 [[TMP67]], 1 +; AVX1-NEXT: [[TMP69:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP68]] +; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP69]], align 2 +; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32> +; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP68]] +; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP54]], align 2 +; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32> +; AVX1-NEXT: [[TMP70:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP53]] +; AVX1-NEXT: [[TMP71:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32> +; AVX1-NEXT: [[TMP72:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32> +; AVX1-NEXT: [[TMP73:%.*]] = mul nsw <4 x i32> [[TMP72]], [[TMP71]] +; AVX1-NEXT: [[TMP74:%.*]] = add nsw <4 x i32> [[TMP73]], [[TMP70]] +; AVX1-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP67]] +; AVX1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP75]], i32 0 +; AVX1-NEXT: store <4 x i32> [[TMP74]], ptr [[TMP76]], align 4 +; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4 +; AVX1-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]] +; AVX1-NEXT: br i1 [[TMP77]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1: vec.epilog.middle.block: +; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]] +; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX1: vec.epilog.scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] ; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]] ; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 @@ -292,7 +332,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 8321cbcb5a1d80..6f769f48bada30 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) local_unnamed_addr #0 { ; CHECK-LABEL: @japi1_vect_42283( -; CHECK-NEXT: top: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1:%.*]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = load atomic ptr, ptr @jlplt_ijl_alloc_array_1d_10294_got unordered, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr addrspace(10) [[TMP3]](ptr addrspace(10) null, i64 0) @@ -19,6 +19,9 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(10) [[TMP5]], i64 0, i32 1 ; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[TBAA8]] ; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[TOP:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[TMP17:%.*]] = icmp ult i64 [[TMP8]], 16 ; CHECK-NEXT: br i1 [[TMP17]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -54,22 +57,53 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], splat (i64 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[L44:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP8]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TOP]] ] +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP8]], 4 +; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF4]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x ptr addrspace(10)> [[BROADCAST_SPLATINSERT10]], <4 x ptr addrspace(10)> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i64> poison, i64 [[DOTUNPACK2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT12]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[L26:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[L26]] ] +; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[L26]] ] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT11]], <4 x ptr addrspace(13)> [[TMP28]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT13]], <4 x ptr addrspace(13)> [[TMP29]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[TBAA10]] +; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND8]], splat (i64 4) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[L26]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N15]], label [[L44]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[L27:%.*]] ; CHECK: L26: -; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[L26]] ] +; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP27:%.*]], [[L27]] ] ; CHECK-NEXT: [[DOTREPACK:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 0 ; CHECK-NEXT: store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[TBAA10]] ; CHECK-NEXT: [[DOTREPACK4:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 1 ; CHECK-NEXT: store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[TBAA10]] ; CHECK-NEXT: [[TMP27]] = add i64 [[VALUE_PHI5]], 1 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]] -; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L26]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[DOTNOT]], label [[L44]], label [[L27]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: L44: ; CHECK-NEXT: ret ptr addrspace(10) null ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 9c42c3769ec177..246cc1b345ad41 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -11,18 +11,20 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: norecurse nounwind readonly uwtable define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 { ; CHECK-LABEL: @matrix_row_col( -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -180,17 +182,55 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]] ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) -; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], [[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0 +; CHECK-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1 +; CHECK-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2 +; CHECK-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3 +; CHECK-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]] +; CHECK-NEXT: [[TMP153:%.*]] = getelementptr inbounds i32, ptr [[TMP152]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP153]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP175]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> poison, i32 [[TMP158]], i32 0 +; CHECK-NEXT: [[TMP163:%.*]] = insertelement <4 x i32> [[TMP162]], i32 [[TMP159]], i32 1 +; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP163]], i32 [[TMP160]], i32 2 +; CHECK-NEXT: [[TMP165:%.*]] = insertelement <4 x i32> [[TMP164]], i32 [[TMP161]], i32 3 +; CHECK-NEXT: [[TMP166:%.*]] = mul nsw <4 x i32> [[TMP165]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP167:%.*]] = add <4 x i32> [[VEC_PHI10]], splat (i32 4) +; CHECK-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]] +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 +; CHECK-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 +; CHECK-NEXT: br i1 [[TMP169]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY1]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] @@ -200,21 +240,23 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] ; ; MAX-BW-LABEL: @matrix_row_col( -; MAX-BW-NEXT: entry: +; MAX-BW-NEXT: iter.check: ; MAX-BW-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 ; MAX-BW-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64 ; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; MAX-BW: vector.main.loop.iter.check: +; MAX-BW-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; MAX-BW: vector.ph: ; MAX-BW-NEXT: br label [[VECTOR_BODY:%.*]] ; MAX-BW: vector.body: -; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ] ; MAX-BW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; MAX-BW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; MAX-BW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -372,17 +414,55 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]] ; MAX-BW-NEXT: [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]] ; MAX-BW-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) -; MAX-BW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; MAX-BW: scalar.ph: -; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; MAX-BW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; MAX-BW: vec.epilog.iter.check: +; MAX-BW-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; MAX-BW: vec.epilog.ph: +; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; MAX-BW-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; MAX-BW-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; MAX-BW-NEXT: br label [[FOR_BODY:%.*]] +; MAX-BW: vec.epilog.vector.body: +; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[FOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], [[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], [[FOR_BODY]] ] +; MAX-BW-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0 +; MAX-BW-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1 +; MAX-BW-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2 +; MAX-BW-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3 +; MAX-BW-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]] +; MAX-BW-NEXT: [[TMP153:%.*]] = getelementptr inbounds i32, ptr [[TMP152]], i32 0 +; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP153]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP157:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP175]], i64 [[IDXPROM5]] +; MAX-BW-NEXT: [[TMP158:%.*]] = load i32, ptr [[TMP154]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP159:%.*]] = load i32, ptr [[TMP155]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP160:%.*]] = load i32, ptr [[TMP156]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP157]], align 4, !tbaa [[TBAA1]] +; MAX-BW-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> poison, i32 [[TMP158]], i32 0 +; MAX-BW-NEXT: [[TMP163:%.*]] = insertelement <4 x i32> [[TMP162]], i32 [[TMP159]], i32 1 +; MAX-BW-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP163]], i32 [[TMP160]], i32 2 +; MAX-BW-NEXT: [[TMP165:%.*]] = insertelement <4 x i32> [[TMP164]], i32 [[TMP161]], i32 3 +; MAX-BW-NEXT: [[TMP166:%.*]] = mul nsw <4 x i32> [[TMP165]], [[WIDE_LOAD11]] +; MAX-BW-NEXT: [[TMP167:%.*]] = add <4 x i32> [[VEC_PHI10]], splat (i32 4) +; MAX-BW-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]] +; MAX-BW-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 +; MAX-BW-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 +; MAX-BW-NEXT: br i1 [[TMP169]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MAX-BW: vec.epilog.middle.block: +; MAX-BW-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) +; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; MAX-BW: vec.epilog.scalar.ph: +; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ] +; MAX-BW-NEXT: br label [[FOR_BODY1:%.*]] ; MAX-BW: for.cond.cleanup: -; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; MAX-BW-NEXT: ret i32 [[ADD7_LCSSA]] ; MAX-BW: for.body: -; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] +; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] +; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY1]] ] ; MAX-BW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] ; MAX-BW-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] ; MAX-BW-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] @@ -392,7 +472,7 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; MAX-BW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; MAX-BW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; MAX-BW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MAX-BW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] ; entry: %idxprom = sext i32 %i to i64 @@ -476,7 +556,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -496,7 +576,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -596,7 +676,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; MAX-BW: middle.block: ; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; MAX-BW: scalar.ph: @@ -616,7 +696,7 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 ; MAX-BW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 ; MAX-BW-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; MAX-BW-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]] +; MAX-BW-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] ; MAX-BW: for.cond.cleanup: ; MAX-BW-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 9feeee004025c9..cf913a4fcc7a1b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -20,58 +20,81 @@ target triple = "x86_64-unknown-linux" ; define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP11]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP16]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP20:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP23:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-NEXT: store <4 x float> [[TMP20]], ptr [[TMP16]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: store <4 x float> [[TMP22]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: store <4 x float> [[TMP23]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: store <4 x float> [[TMP13]], ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX8]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <4 x float> [[WIDE_LOAD9]], [[WIDE_LOAD10]] +; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20 +; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP23]], [[TMP24]] ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -117,15 +140,15 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -134,14 +157,14 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -182,15 +205,15 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -199,14 +222,14 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP6]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index a054d87997b1b5..cd95e9fba94387 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -33,19 +33,20 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: entry: ; AVX2-NEXT: [[I11_NOT1:%.*]] = icmp eq ptr [[START:%.*]], [[END:%.*]] ; AVX2-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12_PREHEADER:%.*]] -; AVX2: bb12.preheader: +; AVX2: iter.check: ; AVX2-NEXT: [[END3:%.*]] = ptrtoint ptr [[END]] to i64 ; AVX2-NEXT: [[START4:%.*]] = ptrtoint ptr [[START]] to i64 ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[END3]], -4 ; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START4]] ; AVX2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 ; AVX2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; AVX2-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP1]], 28 +; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[BB12_PREHEADER1:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX2: vector.main.loop.iter.check: ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: ; AVX2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776 -; AVX2-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2 -; AVX2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -79,12 +80,39 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] -; AVX2-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[BB12_PREHEADER11]] -; AVX2: bb12.preheader8: -; AVX2-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; AVX2-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; AVX2: vec.epilog.iter.check: +; AVX2-NEXT: [[TMP26:%.*]] = shl i64 [[N_VEC]], 2 +; AVX2-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP26]] +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24 +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]] +; AVX2: vec.epilog.ph: +; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX2-NEXT: [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800 +; AVX2-NEXT: [[TMP21:%.*]] = shl i64 [[N_VEC10]], 2 +; AVX2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP21]] ; AVX2-NEXT: br label [[BB12:%.*]] +; AVX2: vec.epilog.vector.body: +; AVX2-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[BB12_PREHEADER11]] ], [ [[INDEX_NEXT16:%.*]], [[BB12]] ] +; AVX2-NEXT: [[OFFSET_IDX13:%.*]] = shl i64 [[INDEX12]], 2 +; AVX2-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX13]] +; AVX2-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i32>, ptr [[NEXT_GEP14]], align 4 +; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD15]], splat (i32 -12) +; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD15]], splat (i32 13) +; AVX2-NEXT: [[TMP24:%.*]] = or <8 x i1> [[TMP22]], [[TMP23]] +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[NEXT_GEP14]], i32 4, <8 x i1> [[TMP24]]) +; AVX2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 8 +; AVX2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC10]] +; AVX2-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2: vec.epilog.middle.block: +; AVX2-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]] +; AVX2-NEXT: br i1 [[CMP_N17]], label [[EXIT]], label [[BB12_PREHEADER1]] +; AVX2: bb12.preheader: +; AVX2-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12_PREHEADER]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AVX2-NEXT: br label [[BB13:%.*]] ; AVX2: bb12: -; AVX2-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER11]] ] +; AVX2-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER1]] ] ; AVX2-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4 ; AVX2-NEXT: switch i32 [[VAL]], label [[LATCH]] [ ; AVX2-NEXT: i32 -12, label [[STORE:%.*]] @@ -96,7 +124,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2: latch: ; AVX2-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 4 ; AVX2-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]] -; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: exit: ; AVX2-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index ac4b56b0fd1b11..2ba2935e07e2f4 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -16,15 +16,18 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; CHECK: for.body.preheader: +; CHECK: iter.check: ; CHECK-NEXT: [[X4:%.*]] = ptrtoint ptr [[X:%.*]] to i64 ; CHECK-NEXT: [[Y5:%.*]] = ptrtoint ptr [[Y:%.*]] to i64 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X4]], [[Y5]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[MIN_ITERS_CHECK]], i1 true, i1 [[DIFF_CHECK]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY_PREHEADER9:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK6:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483632 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i64 0 @@ -35,7 +38,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 32 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 64 @@ -61,10 +64,35 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] -; CHECK: for.body.preheader9: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY_PREHEADER9]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = fdiv fast <4 x double> splat (double 1.000000e+00), [[BROADCAST_SPLAT15]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX12]] +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x double>, ptr [[TMP39]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP40:%.*]] = fmul fast <4 x double> [[WIDE_LOAD13]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX12]] +; CHECK-NEXT: store <4 x double> [[TMP40]], ptr [[TMP41]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 4 +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[TMP42]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[N_VEC11]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]] +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP43]], 7 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]] ; CHECK: for.body.prol.preheader: @@ -81,13 +109,13 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1 ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] -; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.body.prol.loopexit: ; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[TMP20]], -8 ; CHECK-NEXT: br i1 [[TMP21]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9_NEW:%.*]] -; CHECK: for.body.preheader9.new: +; CHECK: for.body.preheader.new: ; CHECK-NEXT: [[TMP22:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP23:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] @@ -148,7 +176,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ;