Skip to content

Commit

Permalink
[AIEX] New convergence strategy for resource conflicts
Browse files Browse the repository at this point in the history
This allows biasing the depth of some SUnits in the hope of moving them
up or down in the scoreboard and avoiding resource conflicts without
necessarily increasing the latency of the whole region.
  • Loading branch information
gbossu committed Oct 15, 2024
1 parent 3d06620 commit 717215e
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 31 deletions.
19 changes: 19 additions & 0 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,24 @@ class LockDelays : public ScheduleDAGMutation {
#undef DEBUG_TYPE
#define DEBUG_TYPE "machine-scheduler"

class BiasDepth : public ScheduleDAGMutation {
void apply(ScheduleDAGInstrs *DAG) override {
auto *Sched = static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
const AIE::BlockState &BS =
Sched->getInterBlock().getBlockState(DAG->getBB());

// It's important to iterate in topological order over SUnits, because
// all its successors will be marked as having a "dirty" depth.
for (SUnit &SU : DAG->SUnits) {
if (auto *It = BS.FixPoint.PerMIExtraDepth.find(SU.getInstr());
It != BS.FixPoint.PerMIExtraDepth.end()) {
unsigned NewDepth = std::max(0, int(SU.getDepth()) + It->second);
SU.setDepthToAtLeast(NewDepth);
}
}
};
};

class RegionEndEdges : public ScheduleDAGMutation {
void removeExitSUPreds(ScheduleDAGInstrs *DAG) {
SUnit &ExitSU = DAG->ExitSU;
Expand Down Expand Up @@ -614,6 +632,7 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT) {
Mutations.emplace_back(std::make_unique<RegionEndEdges>());
Mutations.emplace_back(std::make_unique<MemoryEdges>());
Mutations.emplace_back(std::make_unique<MachineSchedWAWEdges>());
Mutations.emplace_back(std::make_unique<BiasDepth>());
}
return Mutations;
}
Expand Down
91 changes: 83 additions & 8 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,14 @@ static cl::opt<bool> LoopEpilogueAnalysis(
cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling"));

static cl::opt<int> MaxExpensiveIterations(
"aie-loop-aware-expensive-iterations", cl::init(25),
cl::desc("[AIE] Perform Loop/Epilogue analysis with loop scheduling"));
"aie-loop-aware-expensive-iterations", cl::init(35),
cl::desc("[AIE] Maximum iterations for fine-grained convergence in "
"iterative loop scheduling"));

// Command-line switch "aie-loop-aware-bias-depth", enabled by default.
// It is checked in updateScheduling() before the depth-biasing convergence
// strategy is attempted for a loop block.
static cl::opt<bool>
BiasDepth("aie-loop-aware-bias-depth", cl::init(true),
cl::desc("[AIE] Try to bias the depth for hazard avoidance in "
"iterative loop scheduling"));

static cl::opt<int> PostPipelinerMaxII(
"aie-postpipeliner-maxii", cl::init(10),
Expand Down Expand Up @@ -160,6 +166,40 @@ MachineInstr *checkResourceConflictsBottomUp(
return nullptr;
}

/// Replay \p SuccBundles top-down into \p Scoreboard, one bundle per cycle,
/// and look for an instruction of \p SuccBundles that runs into a resource
/// conflict with what \p Scoreboard already holds.
///
/// The first conflicting instruction that has no entry yet in
/// \p Fixedpoint.PerMIExtraDepth is returned immediately. If every conflicting
/// instruction already has an entry, the last one encountered is returned.
/// If nothing conflicts, nullptr is returned.
///
/// Note that \p Scoreboard is modified: it is advanced once per bundle.
///
/// \pre The bundles contain no multi-slot pseudo.
MachineInstr *
checkResourceConflictsTopDown(ResourceScoreboard<FuncUnitWrapper> &Scoreboard,
                              const std::vector<MachineBundle> &SuccBundles,
                              const AIEHazardRecognizer &HR,
                              const FixedpointState &Fixedpoint) {
  DEBUG_LOOPAWARE(dbgs() << "Interblock Predecessor scoreboard:\n";
                  Scoreboard.dump());

  // Most recent conflicting instruction that already carries a depth bias.
  MachineInstr *LastBiasedConflict = nullptr;
  int TopCycle = 0;
  for (const MachineBundle &Bundle : SuccBundles) {
    for (MachineInstr *MI : Bundle.getInstrs()) {
      if (!HR.getHazardType(Scoreboard, MI->getDesc(), HR.getMemoryBanks(MI),
                            MI->operands(), MI->getMF()->getRegInfo(), 0))
        continue;

      DEBUG_LOOPAWARE(dbgs() << "Conflicting MI at Top cycle=" << TopCycle
                             << ": " << *MI);

      // Prefer reporting an instruction that has not been biased so far.
      if (!Fixedpoint.PerMIExtraDepth.contains(MI))
        return MI;
      LastBiasedConflict = MI;
    }
    // Move on to the cycle of the next bundle.
    Scoreboard.advance();
    ++TopCycle;
  }

  return LastBiasedConflict;
}

MachineBasicBlock *getLoopPredecessor(const MachineBasicBlock &MBB) {
if (MBB.pred_size() == 1) {
// if we have only one, it must be the loop
Expand Down Expand Up @@ -346,22 +386,33 @@ bool InterBlockScheduling::leaveBlock() {
return true;
}

MachineInstr *InterBlockScheduling::resourcesConverged(BlockState &BS) const {
MachineInstr *
InterBlockScheduling::resourcesConverged(BlockState &BS,
bool FindInBottomRegion) const {
assert(!BS.getRegions().empty());

// We are a single-block loop body. Check that there is no resource conflict
// on the backedge, by overlaying top and bottom region
if (MachineInstr *MICausingConflict = checkResourceConflictsBottomUp(
createBottomUpScoreboard(BS.getTop().Bundles, *HR),
BS.getBottom().Bundles, *HR))
return MICausingConflict;
if (FindInBottomRegion) {
if (MachineInstr *MICausingConflict = checkResourceConflictsBottomUp(
createBottomUpScoreboard(BS.getTop().Bundles, *HR),
BS.getBottom().Bundles, *HR))
return MICausingConflict;
}

// Bottom represents the resources that are sticking out of the block.
// The last non-empty cycle is a safe upperbound for the resource
// safety margin.
ResourceScoreboard<FuncUnitWrapper> Bottom =
createTopDownScoreboard(BS.getBottom().Bundles, *HR);
BS.FixPoint.MaxResourceExtent = Bottom.lastOccupied();

if (!FindInBottomRegion) {
if (MachineInstr *MICausingConflict = checkResourceConflictsTopDown(
Bottom, BS.getTop().Bundles, *HR, BS.FixPoint))
return MICausingConflict;
}

return nullptr;
}

Expand Down Expand Up @@ -491,12 +542,35 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) {
return BS.FixPoint.Stage = SchedulingStage::Scheduling;
}

if (MachineInstr *MINeedsHigherCap = resourcesConverged(BS);
// Before pushing BS.getBottom() instructions up to avoid resource hazards,
// try and bias the depth of some instructions in BS.getTop()
if (BiasDepth && BS.FixPoint.NumIters <= MaxExpensiveIterations) {
if (MachineInstr *MINeedsHigherCap =
resourcesConverged(BS, /*FindInBottomRegion=*/false);
InterBlockScoreboard && MINeedsHigherCap) {
auto Res = BS.FixPoint.PerMIExtraDepth.try_emplace(MINeedsHigherCap, 1);
int &ExtraDepth = Res.first->second;
if (ExtraDepth >= 0) {
if (!Res.second) // Depth was already biased, try a negative bias
ExtraDepth = -1;
DEBUG_LOOPAWARE(dbgs() << " not converged: resources ExtraDepth="
<< ExtraDepth << "\n");
// Iterate on CurMBB
return BS.FixPoint.Stage = SchedulingStage::Scheduling;
}
DEBUG_LOOPAWARE(dbgs() << " not converged: Depth biasing failed\n");
}
}

// If biasing did not help, actively push instructions from BS.getBottom() up.
if (MachineInstr *MINeedsHigherCap =
resourcesConverged(BS, /*FindInBottomRegion=*/true);
InterBlockScoreboard && MINeedsHigherCap) {
auto Res = BS.FixPoint.PerMILatencyMargin.try_emplace(MINeedsHigherCap, 0);
if (BS.FixPoint.NumIters <= MaxExpensiveIterations) {
++Res.first->second;
} else {
BS.FixPoint.PerMIExtraDepth.clear();
BS.FixPoint.ResourceMargin++;
}
DEBUG_LOOPAWARE(dbgs() << " not converged: resources RM="
Expand All @@ -506,6 +580,7 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) {
// Iterate on CurMBB
return BS.FixPoint.Stage = SchedulingStage::Scheduling;
}

DEBUG_LOOPAWARE(dbgs() << "Converged,"
<< " LatencyExtent=" << BS.FixPoint.MaxLatencyExtent
<< " ResourceExtent=" << BS.FixPoint.MaxResourceExtent
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AIE/AIEInterBlockScheduling.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class FixedpointState {
// Parameters of the loop-aware convergence
int LatencyMargin = 0;
SmallMapVector<MachineInstr *, int, 8> PerMILatencyMargin;
SmallMapVector<MachineInstr *, int, 8> PerMIExtraDepth;
int ResourceMargin = 0;
// The II of the modulo schedule we are trying.
int II = 0;
Expand Down Expand Up @@ -311,7 +312,10 @@ class InterBlockScheduling {

/// Return one instruction that needs to be moved higher to avoid a resource
/// conflict, or nullptr if all resources converged.
MachineInstr *resourcesConverged(BlockState &BS) const;
/// \param FindInBottomRegion Whether the conflicting instruction is searched
/// in the Bottom or Top region of \p BS.
MachineInstr *resourcesConverged(BlockState &BS,
bool FindInBottomRegion = true) const;

/// Return one instruction that needs a higher latency cap, or nullptr if all
/// latencies converged.
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: mova r0, #0 // Delay Slot 1
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_2: // %entry.new
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; nopx ; mov dc0, #0
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc0, #0
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc4, dc0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1
Expand All @@ -111,30 +111,30 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; movx r6, #-4; vadd cm4, cm4, cm1, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; and r1, r1, r6; vadd cm6, cm6, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; add r1, r1, #-4; vadd cm1, cm7, cm5, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vadd cm4, cm4, cm1, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; movx r6, #-4; vadd cm6, cm6, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; and r1, r1, r6; vadd cm1, cm7, cm5, r0
; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3; vadd cm8, cm3, cm0, r0
; ASM-NEXT: add r1, r1, #-4; mov s0, r5
; ASM-NEXT: jz r1, #.LBB0_5
; ASM-NEXT: mov s0, r5 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 3
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 2
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 3
; ASM-NEXT: nop // Delay Slot 2
; ASM-NEXT: nop // Delay Slot 1
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_3: // %for.body
; ASM-NEXT: // =>This Inner Loop Header: Depth=1
; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm5, cm5, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; vadd cm2, cm7, cm4, r0
; ASM-NEXT: nopb ; vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; nops ; nopxm ; vadd cm5, cm5, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; nopb ; nopx ; vadd cm2, cm7, cm4, r0
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; vadd cm3, cm6, cm3, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; add r1, r1, #-4
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vst.srs.d8.s32 cm8, s0, [p3], #32; jnz r1, #.LBB0_3
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r1, r1, #-4
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; jnz r1, #.LBB0_3
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vst.srs.d8.s32 cm8, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm5, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: nop // Delay Slot 3
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm3, s0, [p3], #32 // Delay Slot 2
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm8, cm1, cm0, r0 // Delay Slot 1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; vst.srs.d8.s32 cm2, s0, [p3], #32 // Delay Slot 2
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vst.srs.d8.s32 cm3, s0, [p3], #32; vadd cm8, cm1, cm0, r0 // Delay Slot 1
; ASM-NEXT: // %bb.4:
; ASM-NEXT: nopa ; nopb ; nopxm
; ASM-NEXT: nop
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AIE/aie2/schedule/loopaware/Add2D-like.mir
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $cm0, $cm4, $dc0, $dc4, $dj0, $dj4, $dn0, $dn4, $m0, $m1, $p1, $p2, $p3, $r0, $r1, $s0, $s1, $d0_3d:0x000000000001C870
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $cm8, implicit-def $bml8, implicit-def $amll8, implicit-def $amlh8, implicit-def $bmh8, implicit-def $amhl8, implicit-def $amhh8, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit killed $cm4, implicit $r0 {
; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32)
; CHECK-NEXT: renamable $cm8 = VADD killed renamable $cm4, internal renamable $cm0, renamable $r0
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit killed $r1 {
; CHECK-NEXT: BUNDLE implicit-def $cm4, implicit-def $bml4, implicit-def $amll4, implicit-def $amlh4, implicit-def $bmh4, implicit-def $amhl4, implicit-def $amhh4, implicit-def $p2, implicit-def $dc0, implicit-def $dc4, implicit-def $srups_of, implicit-def $cm8, implicit-def $bml8, implicit-def $amll8, implicit-def $amlh8, implicit-def $bmh8, implicit-def $amhl8, implicit-def $amhh8, implicit $s1, implicit killed $p2, implicit $d0_3d, implicit $crsat, implicit $crupssign, implicit killed $cm0, implicit $r0 {
; CHECK-NEXT: $cm4, $p2, $dc0, $dc4 = VLDA_3D_UPS_S32_D8 $s1, killed $p2, $d0_3d, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 64)
; CHECK-NEXT: renamable $cm8 = VADD internal renamable $cm4, killed renamable $cm0, renamable $r0
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $cm0, implicit-def $bml0, implicit-def $amll0, implicit-def $amlh0, implicit-def $bmh0, implicit-def $amhl0, implicit-def $amhh0, implicit-def $p1, implicit-def $srups_of, implicit-def $r1, implicit-def dead $srcarry, implicit $s1, implicit killed $p1, implicit $m1, implicit $crsat, implicit $crupssign, implicit killed $r1 {
; CHECK-NEXT: renamable $cm0, renamable $p1 = VLDA_UPS_S32_D8_ag_pstm_nrm renamable $s1, killed renamable $p1, renamable $m1, implicit-def $srups_of, implicit $crsat, implicit $crupssign :: (load (<32 x s8>) from stack - 32)
; CHECK-NEXT: renamable $r1 = ADD_add_r_ri killed renamable $r1, -4, implicit-def dead $srcarry
; CHECK-NEXT: }
; CHECK-NEXT: JNZ renamable $r1, %bb.1
Expand Down

0 comments on commit 717215e

Please sign in to comment.