-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NFC][LoopVectorize] Introduce new getEstimatedRuntimeVF function #116247
Conversation
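@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers Author: David Sherwood (david-arm) Changes: There are lots of places where we try to estimate the runtime vectorisation factor based on the getVScaleForTuning TTI hook. I've added a new getEstimatedRuntimeVF function and taught several places in the vectoriser to use this new function. Full diff: https://github.com/llvm/llvm-project/pull/116247.diff 1 Files Affected: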
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ebc62f9843905..32adf9032e9c25 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4300,6 +4300,16 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
return TTI.getVScaleForTuning();
}
+static unsigned getEstimatedRuntimeVF(const Loop *L,
+ const TargetTransformInfo &TTI,
+ ElementCount VF) {
+ unsigned EstimatedVF = VF.getKnownMinValue();
+ if (VF.isScalable())
+ if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
+ EstimatedVF *= *VScale;
+ return EstimatedVF;
+}
+
bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
@@ -4596,17 +4606,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
- unsigned AssumedMinimumVscale =
- getVScaleForTuning(OrigLoop, TTI).value_or(1);
- unsigned Width =
- Candidate.Width.isScalable()
- ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
- : Candidate.Width.getFixedValue();
+ unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
- << AssumedMinimumVscale << ")");
+ << getVScaleForTuning(OrigLoop, TTI).value_or(1)
+ << ")");
LLVM_DEBUG(dbgs() << ".\n");
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4687,12 +4693,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- unsigned Multiplier = 1;
- if (VF.isScalable())
- Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
- if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
- return true;
- return false;
+ return getEstimatedRuntimeVF(TheLoop, TTI, VF) >= EpilogueVectorizationMinVF;
}
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4744,12 +4745,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
- ElementCount EstimatedRuntimeVF = MainLoopVF;
- if (MainLoopVF.isScalable()) {
- EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
- EstimatedRuntimeVF *= *VScale;
- }
+ ElementCount EstimatedRuntimeVF =
+ ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
@@ -4976,11 +4973,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- unsigned EstimatedVF = VF.getKnownMinValue();
- if (VF.isScalable()) {
- if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
- EstimatedVF *= *VScale;
- }
+ unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
@@ -9776,8 +9769,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
}
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
- VectorizationFactor &VF,
- std::optional<unsigned> VScale, Loop *L,
+ VectorizationFactor &VF, Loop *L,
+ const TargetTransformInfo &TTI,
PredicatedScalarEvolution &PSE,
ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
@@ -9829,13 +9822,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
- unsigned IntVF = VF.Width.getKnownMinValue();
- if (VF.Width.isScalable()) {
- unsigned AssumedMinimumVscale = 1;
- if (VScale)
- AssumedMinimumVscale = *VScale;
- IntVF *= AssumedMinimumVscale;
- }
+ unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
uint64_t RtC = *CheckCost.getValue();
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10084,8 +10071,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
- !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
- PSE, SEL)) {
+ !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
Whilst rebasing PR llvm#116247 I discovered an issue where PR llvm#108190 seems to have unintentionally introduced an unfairness in selecting epilogue VFs by making potentially better choices for fixed-width VFs compared to scalable VFs. When considering whether epilogue vectorisation is profitable or not, the latest algorithm appears to be:

  bool IsProfitable = false;
  if (VF.isFixed())
    IsProfitable = (IC * VF.getFixedValue()) >= EpilogueVectorizationMinVF;
  else
    IsProfitable = (getVScaleForTuning() * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF;

Instead, the estimate for the number of scalar iterations processed in the main vector loop should be (IC * estimatedRuntimeVF).
5196c41
to
d992006
Compare
Rebase + added a TODO in isEpilogueVectorizationProfitable |
There are lots of places where we try to estimate the runtime vectorisation factor based on the getVScaleForTuning TTI hook. I've added a new getEstimatedRuntimeVF function and taught several places in the vectoriser to use this new function.
5207e97
to
d1c9f52
Compare
Rebase. Updated |
There are lots of places where we try to estimate the runtime
vectorisation factor based on the getVScaleForTuning TTI hook.
I've added a new getEstimatedRuntimeVF function and taught
several places in the vectoriser to use this new function.