Skip to content

Commit

Permalink
[VPlan] Introduce new entry block to VPlan for early SCEV expansion.
Browse files Browse the repository at this point in the history
This patch adds a new preheader block to the VPlan to place SCEV
expansions like the trip count. This preheader block is disconnected
at the moment, as the bypass blocks of the skeleton are not yet modeled
in VPlan.

The preheader block is executed before skeleton creation, so the SCEV
expansion results can be used during skeleton creation. At the moment,
the trip count expression and induction steps are expanded in the new
preheader. The remainder of SCEV expansions will be moved gradually in
the future.

D147965 will update skeleton creation to use the steps expanded in the
pre-header to fix #58811.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D147964
  • Loading branch information
fhahn committed May 4, 2023
1 parent f19f749 commit b85a402
Show file tree
Hide file tree
Showing 28 changed files with 418 additions and 232 deletions.
126 changes: 61 additions & 65 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,14 @@ class InnerLoopVectorizer {
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

/// Returns the cached trip count of the original loop. This only reads the
/// TripCount member; it may still be null if setTripCount() has not been
/// called yet (executePlan checks for that before setting it).
Value *getTripCount() const { return TripCount; }

/// Used to set the trip count after ILV's construction and after the
/// preheader block has been executed. Note that this always holds the trip
/// count of the original loop for both main loop and epilogue vectorization.
/// \p TC is stored as is and becomes the value returned by getTripCount().
void setTripCount(Value *TC) { TripCount = TC; }

protected:
friend class LoopVectorizationPlanner;

Expand Down Expand Up @@ -605,9 +613,6 @@ class InnerLoopVectorizer {
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);

/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(BasicBlock *InsertBlock);

/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

Expand Down Expand Up @@ -2869,41 +2874,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
PredicatedInstructions.push_back(Cloned);
}

/// Returns the trip count of the original loop, creating it on first use.
///
/// If a trip count is already cached in TripCount it is returned unchanged and
/// \p InsertBlock is not inspected. Otherwise the trip-count SCEV is expanded
/// immediately before the terminator of \p InsertBlock (which must be
/// non-null), cast to the widest induction type when the expansion yields a
/// pointer, cached in TripCount, and returned.
Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
  if (TripCount)
    return TripCount;

  assert(InsertBlock);
  // All new instructions go right before the block's terminator; look it up
  // once instead of re-querying it for every insertion.
  Instruction *InsertPt = InsertBlock->getTerminator();

  // Find the loop boundaries.
  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");
  const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE, OrigLoop);

  const DataLayout &DL = InsertBlock->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*PSE.getSE(), DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), InsertPt);

  // A pointer-typed expansion is converted to the induction type so callers
  // always get a value of type IdxTy.
  if (TripCount->getType()->isPointerTy())
    TripCount = CastInst::CreatePointerCast(TripCount, IdxTy,
                                            "exitcount.ptrcnt.to.int", InsertPt);

  return TripCount;
}

Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (VectorTripCount)
return VectorTripCount;

Value *TC = getOrCreateTripCount(InsertBlock);
Value *TC = getTripCount();
IRBuilder<> Builder(InsertBlock->getTerminator());

Type *Ty = TC->getType();
Expand Down Expand Up @@ -2981,7 +2957,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
}

void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
Expand Down Expand Up @@ -3241,7 +3217,7 @@ void InnerLoopVectorizer::createInductionResumeValues(

BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// The trip counts should be cached by now.
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
Value *Count = getTripCount();
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
Expand Down Expand Up @@ -3281,8 +3257,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
the vectorized instructions while the old loop will continue to run the
scalar remainder.

[ ] <-- loop iteration number check.
/ |
[ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
/ | preheader are expanded here. Eventually all required SCEV
/ | expansion should happen here.
/ v
| [ ] <-- vector loop bypass (may consist of multiple blocks).
| / |
Expand Down Expand Up @@ -3384,7 +3361,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
assert(StepVPV && "step must have been expanded during VPlan execution");
Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
: State.get(StepVPV, 0);
: State.get(StepVPV, {0, 0});
Value *Escape =
emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
Expand Down Expand Up @@ -7704,23 +7681,27 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');

// Workaround! Compute the trip count of the original loop and cache it
// before we start modifying the CFG. This code has a systemic problem
// wherein it tries to run analysis over partially constructed IR; this is
// wrong, and not simply for SCEV. The trip count of the original loop
// simply happens to be prone to hitting this in practice. In theory, we
// can hit the same issue for any SCEV, or ValueTracking query done during
// mutation. See PR49900.
ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());

if (!IsEpilogueVectorization)
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);

// Perform the actual loop transformation.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};

// 0. Generate SCEV-dependent code into the preheader, including TripCount,
// before making any changes to the CFG.
if (!BestVPlan.getPreheader()->empty()) {
State.CFG.PrevBB = OrigLoop->getLoopPreheader();
State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
BestVPlan.getPreheader()->execute(&State);
}
if (!ILV.getTripCount())
ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
else
assert(IsEpilogueVectorization && "should only re-use the existing trip "
"count during epilogue vectorization");

// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
Value *CanonicalIVStartValue;
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
ILV.createVectorizedLoopSkeleton();
Expand Down Expand Up @@ -7756,10 +7737,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
//===------------------------------------------------===//

// 2. Copy and widen instructions from the old loop into the new loop.
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
ILV.getOrCreateVectorTripCount(nullptr),
CanonicalIVStartValue, State,
IsEpilogueVectorization);
BestVPlan.prepareToExecute(
ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr),
CanonicalIVStartValue, State, IsEpilogueVectorization);

BestVPlan.execute(&State);

Expand Down Expand Up @@ -7874,7 +7854,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
assert(Bypass && "Expected valid bypass basic block.");
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
Expand Down Expand Up @@ -8193,7 +8173,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
if (useActiveLaneMask(TFStyle)) {
VPValue *TC = Plan.getOrCreateTripCount();
VPValue *TC = Plan.getTripCount();
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
nullptr, "active.lane.mask");
} else {
Expand Down Expand Up @@ -8770,7 +8750,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
VecPreheader->appendRecipe(CanonicalIVIncrementParts);

// Create the ActiveLaneMask instruction using the correct start values.
VPValue *TC = Plan.getOrCreateTripCount();
VPValue *TC = Plan.getTripCount();

VPValue *TripCount, *IncrementValue;
if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
Expand Down Expand Up @@ -8912,17 +8892,19 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------

// Create initial VPlan skeleton, starting with a block for the pre-header,
// followed by a region for the vector loop, followed by the middle block. The
// skeleton vector loop region contains a header and latch block.
VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
auto Plan = std::make_unique<VPlan>(Preheader);

// Create initial VPlan skeleton, having a basic block for the pre-header
// which contains SCEV expansions that need to happen before the CFG is
// modified; a basic block for the vector pre-header, followed by a region for
// the vector loop, followed by the middle basic block. The skeleton vector
// loop region contains a header and latch basic blocks.
VPlanPtr Plan = VPlan::createInitialVPlan(
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
*PSE.getSE());
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);

Expand Down Expand Up @@ -9110,7 +9092,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

// Create new empty VPlan
auto Plan = std::make_unique<VPlan>();
auto Plan = VPlan::createInitialVPlan(
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
*PSE.getSE());

// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
Expand Down Expand Up @@ -9831,9 +9815,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
// At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
// VPExpandSCEVRecipes can also be uniform.
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
Expand Down Expand Up @@ -10420,6 +10406,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");

// Re-use the trip count expanded for the main loop, as skeleton
// creation needs it as a value that dominates both the scalar and
// vector epilogue loops
EpilogILV.setTripCount(MainILV.getTripCount());
if (auto *R = BestEpiPlan.getTripCount()->getDefiningRecipe()) {
assert(BestEpiPlan.getTripCount()->getNumUsers() == 0 &&
"trip count VPValue cannot be used in epilogue plan");
R->eraseFromParent();
}

// Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
// VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
// before vectorizing the epilogue loop.
Expand Down
53 changes: 26 additions & 27 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
}

void VPBlockBase::setPlan(VPlan *ParentPlan) {
assert(ParentPlan->getEntry() == this &&
"Can only set plan on its entry block.");
assert(
(ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) &&
"Can only set plan on its entry or preheader block.");
Plan = ParentPlan;
}

Expand Down Expand Up @@ -593,12 +594,19 @@ VPlan::~VPlan() {
}
for (VPValue *VPV : VPLiveInsToFree)
delete VPV;
if (TripCount)
delete TripCount;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
}

/// Creates a fresh VPlan made of two new, empty basic blocks: "ph" (the
/// preheader) and "vector.ph" (the vector preheader).
///
/// The plan's TripCount is set to the VPValue obtained for the SCEV
/// \p TripCount via vputils::getOrCreateVPValueForSCEVExpr, using \p SE.
/// Ownership of the returned plan is transferred to the caller.
VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
  auto *PH = new VPBasicBlock("ph");
  auto *VectorPH = new VPBasicBlock("vector.ph");
  auto NewPlan = std::make_unique<VPlan>(PH, VectorPH);

  // Resolve the trip-count expression against the new plan and record the
  // resulting VPValue on it.
  VPValue *TC =
      vputils::getOrCreateVPValueForSCEVExpr(*NewPlan, TripCount, SE);
  NewPlan->TripCount = TC;
  return NewPlan;
}

VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
Expand All @@ -612,13 +620,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
VPTransformState &State,
bool IsEpilogueVectorization) {

// Check if the trip count is needed, and if so build it.
if (TripCount && TripCount->getNumUsers()) {
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
State.set(TripCount, TripCountV, Part);
}

// Check if the backedge taken count is needed, and if so build it.
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
Expand Down Expand Up @@ -747,30 +748,29 @@ void VPlan::print(raw_ostream &O) const {

O << "VPlan '" << getName() << "' {";

bool AnyLiveIn = false;
if (VectorTripCount.getNumUsers() > 0) {
O << "\nLive-in ";
VectorTripCount.printAsOperand(O, SlotTracker);
O << " = vector-trip-count";
AnyLiveIn = true;
}

if (TripCount && TripCount->getNumUsers() > 0) {
O << "\nLive-in ";
TripCount->printAsOperand(O, SlotTracker);
O << " = original trip-count";
AnyLiveIn = true;
}

if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
O << "\nLive-in ";
BackedgeTakenCount->printAsOperand(O, SlotTracker);
O << " = backedge-taken count";
AnyLiveIn = true;
}

if (AnyLiveIn)
O << "\n";
if (TripCount->isLiveIn())
O << "Live-in ";
TripCount->printAsOperand(O, SlotTracker);
O << " = original trip-count";
O << "\n";

if (!getPreheader()->empty()) {
O << "\n";
getPreheader()->print(O, "", SlotTracker);
}

for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) {
O << '\n';
Expand Down Expand Up @@ -897,6 +897,8 @@ void VPlanPrinter::dump() {
OS << "edge [fontname=Courier, fontsize=30]\n";
OS << "compound=true\n";

dumpBlock(Plan.getPreheader());

for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
dumpBlock(Block);

Expand Down Expand Up @@ -1109,8 +1111,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
assignSlot(Plan.BackedgeTakenCount);
if (Plan.TripCount)
assignSlot(Plan.TripCount);
assignSlots(Plan.getPreheader());

ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
Expand Down Expand Up @@ -1140,10 +1141,8 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
else if (auto *E = dyn_cast<SCEVUnknown>(Expr))
Expanded = Plan.getVPValueOrAddLiveIn(E->getValue());
else {

VPBasicBlock *Preheader = Plan.getEntry();
Expanded = new VPExpandSCEVRecipe(Expr, SE);
Preheader->appendRecipe(Expanded->getDefiningRecipe());
Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe());
}
Plan.addSCEVExpansion(Expr, Expanded);
return Expanded;
Expand Down
Loading

0 comments on commit b85a402

Please sign in to comment.