diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 89c16dadb4b41..d5d3787d51c8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -22,6 +22,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPURewriteAGPRCopyMFMA.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
@@ -30,6 +31,7 @@
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -39,6 +41,72 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-rewrite-agpr-copy-mfma"
 
+namespace llvm {
+namespace AMDGPU {
+
+bool checkAGPRCopyMFMAJointDominance(
+    const MachineFunction &MF, const MachineDominatorTree &MDT,
+    const SmallVectorImpl<MachineInstr *> &Stores,
+    const SmallVectorImpl<MachineInstr *> &Loads, int Slot) {
+  SmallPtrSet<const MachineBasicBlock *, 8> StoreBlocks;
+  for (MachineInstr *S : Stores)
+    if (MDT.isReachableFromEntry(S->getParent()))
+      StoreBlocks.insert(S->getParent());
+
+  if (StoreBlocks.empty())
+    return false;
+
+  // Compute the blocks reachable from entry without passing through a store
+  // block.
+  SmallPtrSet<const MachineBasicBlock *, 16> StoreFreeReachable;
+  SmallVector<const MachineBasicBlock *, 16> Worklist;
+
+  const MachineBasicBlock &EntryMBB = MF.front();
+  Worklist.push_back(&EntryMBB);
+  StoreFreeReachable.insert(&EntryMBB);
+
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    // Do not expand the successors of store blocks; every path through them
+    // has already seen a store.
+    if (StoreBlocks.contains(MBB))
+      continue;
+
+    for (const MachineBasicBlock *Succ : MBB->successors())
+      if (StoreFreeReachable.insert(Succ).second)
+        Worklist.push_back(Succ);
+  }
+
+  auto IsLoadJointlyDominatedByStores = [&](MachineInstr *LoadMI) -> bool {
+    const MachineBasicBlock *LoadMBB = LoadMI->getParent();
+    if (!MDT.isReachableFromEntry(LoadMBB))
+      return true;
+
+    // If no store-free path reaches this block, every path to it has passed
+    // through a store block.
+    if (!StoreFreeReachable.contains(LoadMBB))
+      return true;
+
+    // Otherwise, some path reaches this block without having passed a store
+    // block yet.
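+    // The load is only covered if this block itself stores to the slot
+    // before the load: scan the instructions preceding the load for a
+    // mayStore instruction carrying a matching frame-index operand.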
+    for (const MachineInstr &MI : *LoadMBB) {
+      if (&MI == LoadMI)
+        break;
+      if (!MI.mayStore())
+        continue;
+      for (const MachineOperand &MO : MI.operands())
+        if (MO.isFI() && MO.getIndex() == Slot)
+          return true;
+    }
+
+    return false;
+  };
+
+  return llvm::all_of(Loads, IsLoadJointlyDominatedByStores);
+}
+
+} // namespace AMDGPU
+} // namespace llvm
+
 namespace {
 
 STATISTIC(NumMFMAsRewrittenToAGPR,
@@ -58,6 +126,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   LiveIntervals &LIS;
   LiveStacks &LSS;
   const RegisterClassInfo &RegClassInfo;
+  MachineDominatorTree &MDT;
 
   bool attemptReassignmentsToAGPR(SmallSetVector<Register, 2> &InterferingRegs,
                                   MCPhysReg PrefPhysReg) const;
@@ -66,10 +135,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
                                 LiveRegMatrix &LRM, LiveIntervals &LIS,
                                 LiveStacks &LSS,
-                                const RegisterClassInfo &RegClassInfo)
+                                const RegisterClassInfo &RegClassInfo,
+                                MachineDominatorTree &MDT)
       : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
         TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
-        LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {}
+        LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo), MDT(MDT) {}
 
   bool isRewriteCandidate(const MachineInstr &MI) const {
     return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
@@ -515,6 +585,37 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
     if (SpillReferences == SpillSlotReferences.end())
       continue;
 
+    // For each spill reload, every path from the function entry to the
+    // reload must pass through at least one spill store to the same stack
+    // slot; otherwise eliminating the slot could drop an incoming value.
+    SmallVector<MachineInstr *, 4> Stores, Loads;
+    Stores.reserve(SpillReferences->second.size());
+    Loads.reserve(SpillReferences->second.size());
+    for (MachineInstr *MI : SpillReferences->second) {
+      if (MI->mayStore())
+        Stores.push_back(MI);
+      else if (MI->mayLoad())
+        Loads.push_back(MI);
+    }
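+
+    // StoreBlocks is recomputed here so that "no reachable stores" can be
+    // reported separately from a failed joint-dominance check; the helper
+    // below only answers whether the stores jointly dominate the loads.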
+    SmallPtrSet<const MachineBasicBlock *, 8> StoreBlocks;
+    for (MachineInstr *S : Stores)
+      if (MDT.isReachableFromEntry(S->getParent()))
+        StoreBlocks.insert(S->getParent());
+
+    if (StoreBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "Skipping " << printReg(Slot, &TRI)
+                        << ": no reachable stores\n");
+      continue;
+    }
+
+    if (!AMDGPU::checkAGPRCopyMFMAJointDominance(MF, MDT, Stores, Loads,
+                                                 Slot)) {
+      LLVM_DEBUG(
+          dbgs() << "Skipping " << printReg(Slot, &TRI)
+                 << ": some reachable load not jointly dominated by stores\n");
+      continue;
+    }
+
     const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
 
     LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI)
@@ -603,11 +704,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass {
     AU.addRequired<VirtRegMapWrapperLegacy>();
     AU.addRequired<LiveRegMatrixWrapperLegacy>();
     AU.addRequired<LiveStacksWrapperLegacy>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
 
     AU.addPreserved<LiveIntervalsWrapperPass>();
     AU.addPreserved<VirtRegMapWrapperLegacy>();
     AU.addPreserved<LiveRegMatrixWrapperLegacy>();
     AU.addPreserved<LiveStacksWrapperLegacy>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
 
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -622,6 +725,7 @@
 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
                     "AMDGPU Rewrite AGPR-Copy-MFMA", false, false)
@@ -641,7 +745,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction(
   auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
   auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
   auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
-  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
   return Impl.run(MF);
 }
 
@@ -652,10 +757,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
   LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
   LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
   LiveStacks &LSS = MFAM.getResult<LiveStacksAnalysis>(MF);
+  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
   RegisterClassInfo RegClassInfo;
   RegClassInfo.runOnMachineFunction(MF);
 
-  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
   if (!Impl.run(MF))
     return PreservedAnalyses::all();
   auto PA = getMachineFunctionPassPreservedAnalyses();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
new file mode 100644
index 0000000000000..624860f8cb978
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
@@ -0,0 +1,32 @@
+//===- AMDGPURewriteAGPRCopyMFMA.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
+
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class MachineFunction;
+class MachineDominatorTree;
+class MachineInstr;
+
+namespace AMDGPU {
+
+/// Return true if every load in \p Loads is jointly dominated by the stores
+/// in \p Stores: every path from entry to a load crosses a store to \p Slot.
+bool checkAGPRCopyMFMAJointDominance(
+    const MachineFunction &MF, const MachineDominatorTree &MDT,
+    const SmallVectorImpl<MachineInstr *> &Stores,
+    const SmallVectorImpl<MachineInstr *> &Loads, int Slot);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp b/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
new file mode 100644
index 0000000000000..5e2ca64224146
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
@@ -0,0 +1,196 @@
+//===-- AMDGPURewriteAGPRCopyMFMATest.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURewriteAGPRCopyMFMA.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPUUnitTests.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPURewriteAGPRCopyMFMATest : public testing::Test {
+protected:
+  std::unique_ptr<LLVMContext> Context;
+  std::unique_ptr<Module> M;
+  std::unique_ptr<const GCNTargetMachine> TM;
+  std::unique_ptr<MachineModuleInfo> MMI;
+  std::unique_ptr<MachineFunction> MF;
+  std::unique_ptr<MachineDominatorTree> MDT;
+
+  void SetUp() override {
+    TM = createAMDGPUTargetMachine("amdgcn-amd-amdhsa", "gfx90a", "");
+    if (!TM)
+      return;
+
+    Context = std::make_unique<LLVMContext>();
+    M = std::make_unique<Module>("TestModule", *Context);
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+  }
+
+  void createMachineFunction() {
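+    // Create a dummy external void() function to host the test
+    // MachineFunction; the CFG is assembled directly from machine basic
+    // blocks, so no IR body is needed.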
+    M->setDataLayout(TM->createDataLayout());
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Context), false);
+    Function *F = Function::Create(FTy, Function::ExternalLinkage, "test", *M);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+                                           MMI->getContext(), 0);
+  }
+};
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, JointDominanceDiamond) {
+  if (!TM)
+    return;
+  createMachineFunction();
+
+  // Create a diamond CFG:
+  //   Entry -> StoreBB   -> MergeBB
+  //   Entry -> NoStoreBB -> MergeBB
+  // Only one arm stores, so a load in MergeBB is not jointly dominated.
+  MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *StoreBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NoStoreBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *MergeBB = MF->CreateMachineBasicBlock();
+
+  MF->push_back(Entry);
+  MF->push_back(StoreBB);
+  MF->push_back(NoStoreBB);
+  MF->push_back(MergeBB);
+
+  Entry->addSuccessor(StoreBB);
+  Entry->addSuccessor(NoStoreBB);
+  StoreBB->addSuccessor(MergeBB);
+  NoStoreBB->addSuccessor(MergeBB);
+
+  MDT = std::make_unique<MachineDominatorTree>();
+  MDT->recalculate(*MF);
+
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
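+  // INLINEASM serves as a target-independent stand-in for spill code here:
+  // the extra-info immediate marks the instruction mayStore()/mayLoad(), and
+  // the frame-index operand added below ties it to the slot under test.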
+  MachineInstr *StoreMI =
+      BuildMI(StoreBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("store")
+          .addImm(InlineAsm::Extra_MayStore | InlineAsm::Extra_HasSideEffects);
+  MachineInstr *LoadMI =
+      BuildMI(MergeBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("load")
+          .addImm(InlineAsm::Extra_MayLoad | InlineAsm::Extra_HasSideEffects);
+
+  int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+  StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+  LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+  SmallVector<MachineInstr *, 1> Stores = {StoreMI};
+  SmallVector<MachineInstr *, 1> Loads = {LoadMI};
+
+  bool Result =
+      AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+  EXPECT_FALSE(Result);
+}
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, LoadBeforeStoreInLoop) {
+  if (!TM)
+    return;
+  createMachineFunction();
+
+  // Entry -> LoopBB -> Exit, with a LoopBB -> LoopBB backedge. The load
+  // precedes the store inside LoopBB, so the first trip through the loop
+  // reaches the load before any store has executed.
+  MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *Exit = MF->CreateMachineBasicBlock();
+
+  MF->push_back(Entry);
+  MF->push_back(LoopBB);
+  MF->push_back(Exit);
+
+  Entry->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(Exit);
+
+  MDT = std::make_unique<MachineDominatorTree>();
+  MDT->recalculate(*MF);
+
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
+  MachineInstr *LoadMI =
+      BuildMI(LoopBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("load")
+          .addImm(InlineAsm::Extra_MayLoad | InlineAsm::Extra_HasSideEffects);
+  MachineInstr *StoreMI =
+      BuildMI(LoopBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("store")
+          .addImm(InlineAsm::Extra_MayStore | InlineAsm::Extra_HasSideEffects);
+
+  int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+  StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+  LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+  SmallVector<MachineInstr *, 1> Stores = {StoreMI};
+  SmallVector<MachineInstr *, 1> Loads = {LoadMI};
+
+  bool Result =
+      AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+  EXPECT_FALSE(Result);
+}
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, DominatedByPredecessor) {
+  if (!TM)
+    return;
+  createMachineFunction();
+
+  // Entry -> StoreBB -> LoadBB: every path to the load passes through the
+  // store block, so the store jointly dominates the load.
+  MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *StoreBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *LoadBB = MF->CreateMachineBasicBlock();
+
+  MF->push_back(Entry);
+  MF->push_back(StoreBB);
+  MF->push_back(LoadBB);
+
+  Entry->addSuccessor(StoreBB);
+  StoreBB->addSuccessor(LoadBB);
+
+  MDT = std::make_unique<MachineDominatorTree>();
+  MDT->recalculate(*MF);
+
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
+  MachineInstr *StoreMI =
+      BuildMI(StoreBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("store")
+          .addImm(InlineAsm::Extra_MayStore | InlineAsm::Extra_HasSideEffects);
+  MachineInstr *LoadMI =
+      BuildMI(LoadBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("load")
+          .addImm(InlineAsm::Extra_MayLoad | InlineAsm::Extra_HasSideEffects);
+
+  int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+  StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+  LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+  SmallVector<MachineInstr *, 1> Stores = {StoreMI};
+  SmallVector<MachineInstr *, 1> Loads = {LoadMI};
+
+  bool Result =
+      AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+  EXPECT_TRUE(Result);
+}
+
+} // namespace
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index d6cbaf3f3fb5d..57476db34d1f6 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -25,4 +25,5 @@ add_llvm_target_unittest(AMDGPUTests
   ExecMayBeModifiedBeforeAnyUse.cpp
   LiveRegUnits.cpp
   PALMetadata.cpp
+  AMDGPURewriteAGPRCopyMFMATest.cpp
 )