@@ -615,6 +615,138 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
615615 return false ;
616616}
617617
618+ GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy (
619+ const MachineSchedContext *C)
620+ : GCNSchedStrategy(C) {
621+ SchedStages.push_back (GCNSchedStageID::MemoryClauseInitialSchedule);
622+ }
623+
624+ // / GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
625+ // / much as possible. This is achieved by:
626+ // 1. Prioritize clustered operations before stall latency heuristic.
627+ // 2. Prioritize long-latency-load before stall latency heuristic.
628+ // /
629+ // / \param Cand provides the policy and current best candidate.
630+ // / \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
631+ // / \param Zone describes the scheduled zone that we are extending, or nullptr
632+ // / if Cand is from a different zone than TryCand.
633+ // / \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
634+ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate (SchedCandidate &Cand,
635+ SchedCandidate &TryCand,
636+ SchedBoundary *Zone) const {
637+ // Initialize the candidate if needed.
638+ if (!Cand.isValid ()) {
639+ TryCand.Reason = NodeOrder;
640+ return true ;
641+ }
642+
643+ // Bias PhysReg Defs and copies to their uses and defined respectively.
644+ if (tryGreater (biasPhysReg (TryCand.SU , TryCand.AtTop ),
645+ biasPhysReg (Cand.SU , Cand.AtTop ), TryCand, Cand, PhysReg))
646+ return TryCand.Reason != NoCand;
647+
648+ if (DAG->isTrackingPressure ()) {
649+ // Avoid exceeding the target's limit.
650+ if (tryPressure (TryCand.RPDelta .Excess , Cand.RPDelta .Excess , TryCand, Cand,
651+ RegExcess, TRI, DAG->MF ))
652+ return TryCand.Reason != NoCand;
653+
654+ // Avoid increasing the max critical pressure in the scheduled region.
655+ if (tryPressure (TryCand.RPDelta .CriticalMax , Cand.RPDelta .CriticalMax ,
656+ TryCand, Cand, RegCritical, TRI, DAG->MF ))
657+ return TryCand.Reason != NoCand;
658+ }
659+
660+ // MaxMemoryClause-specific: We prioritize clustered instructions as we would
661+ // get more benefit from clausing these memory instructions.
662+ const SUnit *CandNextClusterSU =
663+ Cand.AtTop ? DAG->getNextClusterSucc () : DAG->getNextClusterPred ();
664+ const SUnit *TryCandNextClusterSU =
665+ TryCand.AtTop ? DAG->getNextClusterSucc () : DAG->getNextClusterPred ();
666+ if (tryGreater (TryCand.SU == TryCandNextClusterSU,
667+ Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
668+ return TryCand.Reason != NoCand;
669+
670+ // We only compare a subset of features when comparing nodes between
671+ // Top and Bottom boundary. Some properties are simply incomparable, in many
672+ // other instances we should only override the other boundary if something
673+ // is a clear good pick on one boundary. Skip heuristics that are more
674+ // "tie-breaking" in nature.
675+ bool SameBoundary = Zone != nullptr ;
676+ if (SameBoundary) {
677+ // For loops that are acyclic path limited, aggressively schedule for
678+ // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
679+ // heuristics to take precedence.
680+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps () &&
681+ tryLatency (TryCand, Cand, *Zone))
682+ return TryCand.Reason != NoCand;
683+
684+ // MaxMemoryClause-specific: Prioritize long latency memory load
685+ // instructions in top-bottom order to hide more latency. The mayLoad check
686+ // is used to exclude store-like instructions, which we do not want to
687+ // scheduler them too early.
688+ bool TryMayLoad =
689+ TryCand.SU ->isInstr () && TryCand.SU ->getInstr ()->mayLoad ();
690+ bool CandMayLoad = Cand.SU ->isInstr () && Cand.SU ->getInstr ()->mayLoad ();
691+
692+ if (TryMayLoad || CandMayLoad) {
693+ bool TryLongLatency =
694+ TryCand.SU ->Latency > 10 * Cand.SU ->Latency && TryMayLoad;
695+ bool CandLongLatency =
696+ 10 * TryCand.SU ->Latency < Cand.SU ->Latency && CandMayLoad;
697+
698+ if (tryGreater (Zone->isTop () ? TryLongLatency : CandLongLatency,
699+ Zone->isTop () ? CandLongLatency : TryLongLatency, TryCand,
700+ Cand, Stall))
701+ return TryCand.Reason != NoCand;
702+ }
703+ // Prioritize instructions that read unbuffered resources by stall cycles.
704+ if (tryLess (Zone->getLatencyStallCycles (TryCand.SU ),
705+ Zone->getLatencyStallCycles (Cand.SU ), TryCand, Cand, Stall))
706+ return TryCand.Reason != NoCand;
707+ }
708+
709+ if (SameBoundary) {
710+ // Weak edges are for clustering and other constraints.
711+ if (tryLess (getWeakLeft (TryCand.SU , TryCand.AtTop ),
712+ getWeakLeft (Cand.SU , Cand.AtTop ), TryCand, Cand, Weak))
713+ return TryCand.Reason != NoCand;
714+ }
715+
716+ // Avoid increasing the max pressure of the entire region.
717+ if (DAG->isTrackingPressure () &&
718+ tryPressure (TryCand.RPDelta .CurrentMax , Cand.RPDelta .CurrentMax , TryCand,
719+ Cand, RegMax, TRI, DAG->MF ))
720+ return TryCand.Reason != NoCand;
721+
722+ if (SameBoundary) {
723+ // Avoid critical resource consumption and balance the schedule.
724+ TryCand.initResourceDelta (DAG, SchedModel);
725+ if (tryLess (TryCand.ResDelta .CritResources , Cand.ResDelta .CritResources ,
726+ TryCand, Cand, ResourceReduce))
727+ return TryCand.Reason != NoCand;
728+ if (tryGreater (TryCand.ResDelta .DemandedResources ,
729+ Cand.ResDelta .DemandedResources , TryCand, Cand,
730+ ResourceDemand))
731+ return TryCand.Reason != NoCand;
732+
733+ // Avoid serializing long latency dependence chains.
734+ // For acyclic path limited loops, latency was already checked above.
735+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy .ReduceLatency &&
736+ !Rem.IsAcyclicLatencyLimited && tryLatency (TryCand, Cand, *Zone))
737+ return TryCand.Reason != NoCand;
738+
739+ // Fall through to original instruction order.
740+ if (Zone->isTop () == (TryCand.SU ->NodeNum < Cand.SU ->NodeNum )) {
741+ assert (TryCand.SU ->NodeNum != Cand.SU ->NodeNum );
742+ TryCand.Reason = NodeOrder;
743+ return true ;
744+ }
745+ }
746+
747+ return false ;
748+ }
749+
618750GCNScheduleDAGMILive::GCNScheduleDAGMILive (
619751 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
620752 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
@@ -644,6 +776,9 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
644776 return std::make_unique<PreRARematStage>(SchedStageID, *this );
645777 case GCNSchedStageID::ILPInitialSchedule:
646778 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this );
779+ case GCNSchedStageID::MemoryClauseInitialSchedule:
780+ return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
781+ *this );
647782 }
648783
649784 llvm_unreachable (" Unknown SchedStageID." );
@@ -869,6 +1004,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
8691004 case GCNSchedStageID::ILPInitialSchedule:
8701005 OS << " Max ILP Initial Schedule" ;
8711006 break ;
1007+ case GCNSchedStageID::MemoryClauseInitialSchedule:
1008+ OS << " Max memory clause Initial Schedule" ;
1009+ break ;
8721010 }
8731011
8741012 return OS;
@@ -1088,7 +1226,8 @@ void GCNSchedStage::setupNewBlock() {
10881226 // Get real RP for the region if it hasn't be calculated before. After the
10891227 // initial schedule stage real RP will be collected after scheduling.
10901228 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1091- StageID == GCNSchedStageID::ILPInitialSchedule)
1229+ StageID == GCNSchedStageID::ILPInitialSchedule ||
1230+ StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
10921231 DAG.computeBlockPressure (RegionIdx, CurrentMBB);
10931232}
10941233
@@ -1389,6 +1528,11 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
13891528 return false ;
13901529}
13911530
1531+ bool MemoryClauseInitialScheduleStage::shouldRevertScheduling (
1532+ unsigned WavesAfter) {
1533+ return mayCauseSpilling (WavesAfter);
1534+ }
1535+
13921536bool GCNSchedStage::mayCauseSpilling (unsigned WavesAfter) {
13931537 if (WavesAfter <= MFI.getMinWavesPerEU () && isRegionWithExcessRP () &&
13941538 !PressureAfter.less (MF, PressureBefore)) {
0 commit comments