Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AIEX] Improve iterative scheduling convergence strategy #212

Merged
merged 5 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 4 additions & 39 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,9 +568,7 @@ class WAWStickyRegistersEdges : public ScheduleDAGMutation {
BitVector AllRegs(RI->getNumRegs());
AllRegs.reset();
// Here, we analyze which sticky registers are explicitly redefined
// or read. We also track all instructions implicitly reading or
// defining such registers.
std::map<const Register, SmallVector<const MachineInstr *, 16>> RegMIsMap;
// or read.
for (const MachineInstr &MI : make_range(DAG->begin(), DAG->end())) {
for (const MachineOperand &MOP : MI.operands()) {
if (!MOP.isReg())
Expand All @@ -580,47 +578,14 @@ class WAWStickyRegistersEdges : public ScheduleDAGMutation {
if (!Reg.isPhysical() || !RI->isReservedStickyReg(Reg))
continue;

if ((!MOP.isImplicit() && (MOP.isDef() || MOP.readsReg())) ||
(MOP.isImplicit() && MOP.readsReg())) {
if (MOP.readsReg() || (!MOP.isImplicit() && MOP.isDef())) {
AllRegs.set(Reg);

} else if (MOP.isImplicit() && MOP.isDef()) {
// Instruction that could have a dependency removal.
// We track it because of the next heuristic.
RegMIsMap[Reg].push_back(&MI);
}
}
}

auto IsLoad = [&](const MachineInstr *MI) -> bool { return MI->mayLoad(); };
auto IsStore = [&](const MachineInstr *MI) -> bool {
return MI->mayStore();
};

// This is the heuristic component. We catch basically cases where
// registers are only defined by loads or store within a region,
// For example, cases like exemplified below (region):
// [sequence non-defining sticky regs. instructions.]
// VST.CONV ...
// VST.CONV ...
// VST.CONV ...
// VST.CONV ...
// In this case, by removing dependencies between pairs of VST.CONVs,
// we give too much freedom to the scheduler to do good, but also
// not good choices. In this way, we filter those cases off.
for (auto RMIs : RegMIsMap) {
const Register Reg = RMIs.first;
SmallVector<const MachineInstr *, 16> &MIs = RMIs.second;
// The first thing to test is the tuning parameter: we only consider
// cases where the number of memory ops are <= the threshold.
if (MIs.size() <= WAWStickyRegistersMemOpsThreshold &&
((all_of(MIs, IsLoad) || all_of(MIs, IsStore))))
AllRegs.set(Reg);
}

// Next part is to drop all output latencies related to
// registers that are not explicitly read or defined also
// considering the heuristically filtered cases.
// Next part is to drop all output dependencies related to
// registers that are not explicitly redefined or read.
for (SUnit &SU : DAG->SUnits) {
for (const SDep &Dep : getPreds(SU)) {
if (Dep.getKind() != SDep::Kind::Output)
Expand Down
12 changes: 9 additions & 3 deletions llvm/lib/Target/AIE/AIEHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,8 +475,13 @@ bool AIEHazardRecognizer::checkConflict(
MemoryBanks);
for (auto Cycles : MemoryAccessCycles) {
// MemoryAccessCycles starts counting from 1, so we need to subtract 1
if (MemoryBankAccessCycle.conflict(Scoreboard[DeltaCycles + Cycles - 1]))
int AccessCycle = DeltaCycles + Cycles - 1;
if (MemoryBankAccessCycle.conflict(Scoreboard[AccessCycle])) {
LLVM_DEBUG(dbgs() << "*** Memory bank conflict in cycle=" << AccessCycle
<< ":\n";
MemoryBankAccessCycle.dump(); dbgs() << "\n");
return true;
}
}
}

Expand All @@ -496,8 +501,9 @@ bool AIEHazardRecognizer::checkConflict(
assert(StageCycle < Scoreboard.getDepth());

if (ThisCycle.conflict(Scoreboard[StageCycle])) {
LLVM_DEBUG(dbgs() << "*** Hazard in execution cycle"
<< StageCycle - DeltaCycles << ", ");
LLVM_DEBUG(dbgs() << "*** Hazard in cycle=" << StageCycle
<< " EC=" << StageCycle - DeltaCycles << ":\n";
ThisCycle.dump(); dbgs() << "\n");
return true;
}
}
Expand Down
60 changes: 42 additions & 18 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,20 +60,41 @@ void dumpInterBlock(const InterBlockEdges &Edges) {
}
}

void emitBundlesTopDown(const std::vector<MachineBundle> &Bundles,
ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
AIEHazardRecognizer *HR) {
ResourceScoreboard<FuncUnitWrapper>
createTopDownScoreboard(ArrayRef<MachineBundle> Bundles,
const AIEHazardRecognizer &HR) {
ResourceScoreboard<FuncUnitWrapper> Scoreboard;
Scoreboard.reset(HR.getMaxLookAhead());

const int TotalBundles = Bundles.size();
const int AmountToEmit = std::min(TotalBundles, HR->getConflictHorizon());
const int AmountToEmit = std::min(TotalBundles, HR.getConflictHorizon());
// Do not emit more bundles than specified by the conflict horizon. More
// than this will not cause conflicts.
for (int I = TotalBundles - AmountToEmit; I < TotalBundles; I++) {
for (MachineInstr *MI : Bundles[I].getInstrs())
HR->emitInScoreboard(Scoreboard, MI->getDesc(), HR->getMemoryBanks(MI),
MI->operands(), MI->getMF()->getRegInfo(), 0);
HR.emitInScoreboard(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
MI->operands(), MI->getMF()->getRegInfo(), 0);
Scoreboard.advance();
}

DEBUG_LOOPAWARE(dbgs() << "*** Emitted " << TotalBundles << " top-down\n");

// If an iteration contains fewer bundles than the number of resources that
// stick out into the next one, this means that the first cycles of the
// scoreboard could potentially be "clobbered" by previous iterations.
// We conservatively block those cycles.
const int MaxResourceExtent = Scoreboard.lastOccupied();
assert(MaxResourceExtent <= HR.getConflictHorizon());
if (MaxResourceExtent > AmountToEmit) {
const int NumBlockedCycles = MaxResourceExtent - AmountToEmit;
const int FirstBlockedCycle = -AmountToEmit;
const int LastBlockedCycle = FirstBlockedCycle + NumBlockedCycles - 1;
for (int C = FirstBlockedCycle; C <= LastBlockedCycle; ++C) {
Scoreboard[C].blockResources();
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this NFC?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned it in the commit: in the context of how this was used, this is an NFC.


return Scoreboard;
}

ResourceScoreboard<FuncUnitWrapper>
Expand Down Expand Up @@ -112,22 +133,25 @@ createBottomUpScoreboard(ArrayRef<MachineBundle> Bundles,
/// from \p PredBundles that is responsible for it.
///
/// \pre The bundles contain no multi-slot pseudo.
MachineInstr *
checkResourceConflicts(const ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
const std::vector<MachineBundle> &PredBundles,
const AIEHazardRecognizer &HR) {
MachineInstr *checkResourceConflictsBottomUp(
const ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
const std::vector<MachineBundle> &PredBundles,
const AIEHazardRecognizer &HR) {
DEBUG_LOOPAWARE(dbgs() << "Interblock Successor scoreboard:\n";
Scoreboard.dump());

int BottomUpCycle = 0;
for (const MachineBundle &B : reverse(PredBundles)) {
if (BottomUpCycle >= HR.getConflictHorizon())
break;
for (MachineInstr *MI : B.getInstrs()) {
if (BottomUpCycle >= HR.getConflictHorizon())
break;
if (HR.getHazardType(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
MI->operands(), MI->getMF()->getRegInfo(),
-BottomUpCycle))
-BottomUpCycle)) {
DEBUG_LOOPAWARE(dbgs() << "Conflicting MI at Bottom-up cycle="
<< BottomUpCycle << ": " << *MI);
return MI;
}
}
++BottomUpCycle;
}
Expand Down Expand Up @@ -327,17 +351,16 @@ MachineInstr *InterBlockScheduling::resourcesConverged(BlockState &BS) const {

// We are a single-block loop body. Check that there is no resource conflict
// on the backedge, by overlaying top and bottom region
if (MachineInstr *MICausingConflict = checkResourceConflicts(
if (MachineInstr *MICausingConflict = checkResourceConflictsBottomUp(
createBottomUpScoreboard(BS.getTop().Bundles, *HR),
BS.getBottom().Bundles, *HR))
return MICausingConflict;

// Bottom represents the resources that are sticking out of the block.
// The last non-empty cycle is a safe upperbound for the resource
// safety margin.
ResourceScoreboard<FuncUnitWrapper> Bottom;
Bottom.reset(HR->getMaxLookAhead());
emitBundlesTopDown(BS.getBottom().Bundles, Bottom, HR.get());
ResourceScoreboard<FuncUnitWrapper> Bottom =
createTopDownScoreboard(BS.getBottom().Bundles, *HR);
BS.FixPoint.MaxResourceExtent = Bottom.lastOccupied();
return nullptr;
}
Expand Down Expand Up @@ -828,7 +851,8 @@ int InterBlockScheduling::getCyclesToAvoidResourceConflicts(

// Increment the number of intermediate nops until there are no resource
// conflicts between the last iteration of the loop and the epilogue.
while (checkResourceConflicts(Scoreboard, LoopBS.getBottom().Bundles, *HR)) {
while (checkResourceConflictsBottomUp(Scoreboard, LoopBS.getBottom().Bundles,
*HR)) {
Scoreboard.recede();
++NopCounter;
}
Expand Down
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -98,58 +98,58 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: mova r0, #0 // Delay Slot 1
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_2: // %entry.new
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; nopb ; nopx ; mov dc0, #0; nops
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; nopx ; mov dc0, #0
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc4, dc0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0; mov crUPSSign, r4
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; mov s1, r2
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm8, s1, [p2], d0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0; vadd cm6, cm1, cm0, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; mov s1, r2
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm3, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; movx r6, #-4; vadd cm4, cm4, cm2, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; and r1, r1, r6; vadd cm3, cm5, cm3, r0
; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3
; ASM-NEXT: add r1, r1, #-4; mov s0, r5
; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32; jz r1, #.LBB0_5
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32; vadd cm8, cm8, cm2, r0 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm7, cm1, cm0, r0 // Delay Slot 4
; ASM-NEXT: nop // Delay Slot 3
; ASM-NEXT: nop // Delay Slot 2
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; movx r6, #-4; vadd cm4, cm4, cm1, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; and r1, r1, r6; vadd cm6, cm6, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; add r1, r1, #-4; vadd cm1, cm7, cm5, r0
; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3; vadd cm8, cm3, cm0, r0
; ASM-NEXT: jz r1, #.LBB0_5
; ASM-NEXT: mov s0, r5 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 3
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 2
; ASM-NEXT: nop // Delay Slot 1
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_3: // %for.body
; ASM-NEXT: // =>This Inner Loop Header: Depth=1
; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm2, s1, [p1], m1; nops ; nopxm ; nopv
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; nopx ; vadd cm5, cm6, cm5, r0
; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4
; ASM-NEXT: vst.srs.d8.s32 cm7, s0, [p3], #32; jnz r1, #.LBB0_3; vadd cm3, cm4, cm3, r0
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 // Delay Slot 4
; ASM-NEXT: vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 3
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm7, cm1, cm0, r0 // Delay Slot 2
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm6, cm2, r0 // Delay Slot 1
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm5, cm5, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; vadd cm2, cm7, cm4, r0
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; vadd cm3, cm6, cm3, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; add r1, r1, #-4
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vst.srs.d8.s32 cm8, s0, [p3], #32; jnz r1, #.LBB0_3
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: nop // Delay Slot 3
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm3, s0, [p3], #32 // Delay Slot 2
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm8, cm1, cm0, r0 // Delay Slot 1
; ASM-NEXT: // %bb.4:
; ASM-NEXT: nopa ; nopxm
; ASM-NEXT: nop
; ASM-NEXT: nopa ; nopb ; nopxm
; ASM-NEXT: nop
; ASM-NEXT: nop
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_5:
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm5, cm6, cm5, r0
; ASM-NEXT: vadd cm3, cm4, cm3, r0
; ASM-NEXT: nopa ; nopb ; nopx ; vadd cm4, cm7, cm4, r0
; ASM-NEXT: vadd cm3, cm6, cm3, r0
; ASM-NEXT: vadd cm2, cm5, cm2, r0
; ASM-NEXT: vadd cm0, cm1, cm0, r0
; ASM-NEXT: nop
; ASM-NEXT: nop
; ASM-NEXT: vst.srs.d8.s32 cm7, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm8, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm5, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov crUPSSign, #0
; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32; mov r6, dc0
; ASM-NEXT: mov r0, dc4
Expand Down