diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 89c16dadb4b41..d5d3787d51c8c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -22,6 +22,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPURewriteAGPRCopyMFMA.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
@@ -30,6 +31,7 @@
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -39,6 +41,72 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-rewrite-agpr-copy-mfma"
 
+namespace llvm {
+namespace AMDGPU {
+
+bool checkAGPRCopyMFMAJointDominance(
+    const MachineFunction &MF, const MachineDominatorTree &MDT,
+    const SmallVectorImpl<MachineInstr *> &Stores,
+    const SmallVectorImpl<MachineInstr *> &Loads, int Slot) {
+  SmallPtrSet<const MachineBasicBlock *, 8> StoreBlocks;
+  for (MachineInstr *S : Stores)
+    if (MDT.isReachableFromEntry(S->getParent()))
+      StoreBlocks.insert(S->getParent());
+
+  if (StoreBlocks.empty())
+    return false;
+
+  // Compute the blocks reachable from entry without passing through a store
+  // block.
+  SmallPtrSet<const MachineBasicBlock *, 16> StoreFreeReachable;
+  SmallVector<const MachineBasicBlock *, 16> Worklist;
+
+  const MachineBasicBlock &EntryMBB = MF.front();
+  Worklist.push_back(&EntryMBB);
+  StoreFreeReachable.insert(&EntryMBB);
+
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    // Do not expand the successors of store blocks; every path through them
+    // has already seen a store.
+    if (StoreBlocks.contains(MBB))
+      continue;
+
+    for (const MachineBasicBlock *Succ : MBB->successors())
+      if (StoreFreeReachable.insert(Succ).second)
+        Worklist.push_back(Succ);
+  }
+
+  auto IsLoadJointlyDominatedByStores = [&](MachineInstr *LoadMI) -> bool {
+    const MachineBasicBlock *LoadMBB = LoadMI->getParent();
+    if (!MDT.isReachableFromEntry(LoadMBB))
+      return true;
+
+    // If no store-free path reaches this block, every path to it has passed
+    // through a store block.
+    if (!StoreFreeReachable.contains(LoadMBB))
+      return true;
+
+    // Otherwise, some path reaches this block without having passed a store
+    // block yet.
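+    // The load is only covered if this block itself stores to the slot
+    // before the load: scan the instructions preceding the load for a
+    // mayStore instruction carrying a matching frame-index operand.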
+    for (const MachineInstr &MI : *LoadMBB) {
+      if (&MI == LoadMI)
+        break;
+      if (!MI.mayStore())
+        continue;
+      for (const MachineOperand &MO : MI.operands())
+        if (MO.isFI() && MO.getIndex() == Slot)
+          return true;
+    }
+
+    return false;
+  };
+
+  return llvm::all_of(Loads, IsLoadJointlyDominatedByStores);
+}
+
+} // namespace AMDGPU
+} // namespace llvm
+
 namespace {
 
 STATISTIC(NumMFMAsRewrittenToAGPR,
@@ -58,6 +126,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   LiveIntervals &LIS;
   LiveStacks &LSS;
   const RegisterClassInfo &RegClassInfo;
+  MachineDominatorTree &MDT;
 
   bool attemptReassignmentsToAGPR(SmallSetVector<Register, 2> &InterferingRegs,
                                   MCPhysReg PrefPhysReg) const;
@@ -66,10 +135,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
                                 LiveRegMatrix &LRM, LiveIntervals &LIS,
                                 LiveStacks &LSS,
-                                const RegisterClassInfo &RegClassInfo)
+                                const RegisterClassInfo &RegClassInfo,
+                                MachineDominatorTree &MDT)
       : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
         TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
-        LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {}
+        LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo), MDT(MDT) {}
 
   bool isRewriteCandidate(const MachineInstr &MI) const {
     return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
@@ -515,6 +585,37 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
     if (SpillReferences == SpillSlotReferences.end())
       continue;
 
+    // For each spill reload, every path from the function entry to the
+    // reload must pass through at least one spill store to the same stack
+    // slot; otherwise eliminating the slot could drop an incoming value.
+    SmallVector<MachineInstr *, 4> Stores, Loads;
+    Stores.reserve(SpillReferences->second.size());
+    Loads.reserve(SpillReferences->second.size());
+    for (MachineInstr *MI : SpillReferences->second) {
+      if (MI->mayStore())
+        Stores.push_back(MI);
+      else if (MI->mayLoad())
+        Loads.push_back(MI);
+    }
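+
+    // StoreBlocks is recomputed here so that "no reachable stores" can be
+    // reported separately from a failed joint-dominance check; the helper
+    // below only answers whether the stores jointly dominate the loads.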
+    SmallPtrSet<const MachineBasicBlock *, 8> StoreBlocks;
+    for (MachineInstr *S : Stores)
+      if (MDT.isReachableFromEntry(S->getParent()))
+        StoreBlocks.insert(S->getParent());
+
+    if (StoreBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "Skipping " << printReg(Slot, &TRI)
+                        << ": no reachable stores\n");
+      continue;
+    }
+
+    if (!AMDGPU::checkAGPRCopyMFMAJointDominance(MF, MDT, Stores, Loads,
+                                                 Slot)) {
+      LLVM_DEBUG(
+          dbgs() << "Skipping " << printReg(Slot, &TRI)
+                 << ": some reachable load not jointly dominated by stores\n");
+      continue;
+    }
+
     const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
 
     LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI)
@@ -603,11 +704,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass {
     AU.addRequired<VirtRegMapWrapperLegacy>();
     AU.addRequired<LiveRegMatrixWrapperLegacy>();
     AU.addRequired<LiveStacksWrapperLegacy>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
 
     AU.addPreserved<LiveIntervalsWrapperPass>();
     AU.addPreserved<VirtRegMapWrapperLegacy>();
     AU.addPreserved<LiveRegMatrixWrapperLegacy>();
     AU.addPreserved<LiveStacksWrapperLegacy>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
 
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -622,6 +725,7 @@
 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
                     "AMDGPU Rewrite AGPR-Copy-MFMA", false, false)
@@ -641,7 +745,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction(
   auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
   auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
   auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
-  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
   return Impl.run(MF);
 }
 
@@ -652,10 +757,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
   LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
   LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
   LiveStacks &LSS = MFAM.getResult<LiveStacksAnalysis>(MF);
+  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
   RegisterClassInfo RegClassInfo;
   RegClassInfo.runOnMachineFunction(MF);
 
-  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+  AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
   if (!Impl.run(MF))
     return PreservedAnalyses::all();
   auto PA = getMachineFunctionPassPreservedAnalyses();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
new file mode 100644
index 0000000000000..624860f8cb978
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.h
@@ -0,0 +1,32 @@
+//===- AMDGPURewriteAGPRCopyMFMA.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
+
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class MachineFunction;
+class MachineDominatorTree;
+class MachineInstr;
+
+namespace AMDGPU {
+
+/// Return true if every load in \p Loads is jointly dominated by the stores
+/// in \p Stores: every path from entry to a load crosses a store to \p Slot.
+bool checkAGPRCopyMFMAJointDominance(
+    const MachineFunction &MF, const MachineDominatorTree &MDT,
+    const SmallVectorImpl<MachineInstr *> &Stores,
+    const SmallVectorImpl<MachineInstr *> &Loads, int Slot);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUREWRITEAGPRCOPYMFMA_H
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp b/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
new file mode 100644
index 0000000000000..5e2ca64224146
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMATest.cpp
@@ -0,0 +1,196 @@
+//===-- AMDGPURewriteAGPRCopyMFMATest.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURewriteAGPRCopyMFMA.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPUUnitTests.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPURewriteAGPRCopyMFMATest : public testing::Test {
+protected:
+  std::unique_ptr<LLVMContext> Context;
+  std::unique_ptr<Module> M;
+  std::unique_ptr<const GCNTargetMachine> TM;
+  std::unique_ptr<MachineModuleInfo> MMI;
+  std::unique_ptr<MachineFunction> MF;
+  std::unique_ptr<MachineDominatorTree> MDT;
+
+  void SetUp() override {
+    TM = createAMDGPUTargetMachine("amdgcn-amd-amdhsa", "gfx90a", "");
+    if (!TM)
+      return;
+
+    Context = std::make_unique<LLVMContext>();
+    M = std::make_unique<Module>("TestModule", *Context);
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+  }
+
+  void createMachineFunction() {
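+    // Create a dummy external void() function to host the test
+    // MachineFunction; the CFG is assembled directly from machine basic
+    // blocks, so no IR body is needed.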
+    M->setDataLayout(TM->createDataLayout());
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Context), false);
+    Function *F = Function::Create(FTy, Function::ExternalLinkage, "test", *M);
+    MF = std::make_unique<MachineFunction>(*F, *TM, *TM->getSubtargetImpl(*F),
+                                           MMI->getContext(), 0);
+  }
+};
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, JointDominanceDiamond) {
+  if (!TM)
+    return;
+  createMachineFunction();
+
+  // Create a diamond CFG:
+  //   Entry -> StoreBB   -> MergeBB
+  //   Entry -> NoStoreBB -> MergeBB
+  // Only one arm stores, so a load in MergeBB is not jointly dominated.
+  MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *StoreBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NoStoreBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *MergeBB = MF->CreateMachineBasicBlock();
+
+  MF->push_back(Entry);
+  MF->push_back(StoreBB);
+  MF->push_back(NoStoreBB);
+  MF->push_back(MergeBB);
+
+  Entry->addSuccessor(StoreBB);
+  Entry->addSuccessor(NoStoreBB);
+  StoreBB->addSuccessor(MergeBB);
+  NoStoreBB->addSuccessor(MergeBB);
+
+  MDT = std::make_unique<MachineDominatorTree>();
+  MDT->recalculate(*MF);
+
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
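+  // INLINEASM serves as a target-independent stand-in for spill code here:
+  // the extra-info immediate marks the instruction mayStore()/mayLoad(), and
+  // the frame-index operand added below ties it to the slot under test.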
+  MachineInstr *StoreMI =
+      BuildMI(StoreBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("store")
+          .addImm(InlineAsm::Extra_MayStore | InlineAsm::Extra_HasSideEffects);
+  MachineInstr *LoadMI =
+      BuildMI(MergeBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("load")
+          .addImm(InlineAsm::Extra_MayLoad | InlineAsm::Extra_HasSideEffects);
+
+  int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+  StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+  LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+  SmallVector<MachineInstr *, 1> Stores = {StoreMI};
+  SmallVector<MachineInstr *, 1> Loads = {LoadMI};
+
+  bool Result =
+      AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+  EXPECT_FALSE(Result);
+}
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, LoadBeforeStoreInLoop) {
+  if (!TM)
+    return;
+  createMachineFunction();
+
+  // Entry -> LoopBB -> Exit, with a LoopBB -> LoopBB backedge. The load
+  // precedes the store inside LoopBB, so the first trip through the loop
+  // reaches the load before any store has executed.
+  MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *Exit = MF->CreateMachineBasicBlock();
+
+  MF->push_back(Entry);
+  MF->push_back(LoopBB);
+  MF->push_back(Exit);
+
+  Entry->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(Exit);
+
+  MDT = std::make_unique<MachineDominatorTree>();
+  MDT->recalculate(*MF);
+
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
+  MachineInstr *LoadMI =
+      BuildMI(LoopBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("load")
+          .addImm(InlineAsm::Extra_MayLoad | InlineAsm::Extra_HasSideEffects);
+  MachineInstr *StoreMI =
+      BuildMI(LoopBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("store")
+          .addImm(InlineAsm::Extra_MayStore | InlineAsm::Extra_HasSideEffects);
+
+  int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+  StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+  LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+  SmallVector<MachineInstr *, 1> Stores = {StoreMI};
+  SmallVector<MachineInstr *, 1> Loads = {LoadMI};
+
+  bool Result =
+      AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+  EXPECT_FALSE(Result);
+}
+
+TEST_F(AMDGPURewriteAGPRCopyMFMATest, DominatedByPredecessor) {
+  if (!TM)
+    return;
+  createMachineFunction();
+
+  // Entry -> StoreBB -> LoadBB: every path to the load passes through the
+  // store block, so the store jointly dominates the load.
+  MachineBasicBlock *Entry = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *StoreBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *LoadBB = MF->CreateMachineBasicBlock();
+
+  MF->push_back(Entry);
+  MF->push_back(StoreBB);
+  MF->push_back(LoadBB);
+
+  Entry->addSuccessor(StoreBB);
+  StoreBB->addSuccessor(LoadBB);
+
+  MDT = std::make_unique<MachineDominatorTree>();
+  MDT->recalculate(*MF);
+
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  DebugLoc DL;
+
+  MachineInstr *StoreMI =
+      BuildMI(StoreBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("store")
+          .addImm(InlineAsm::Extra_MayStore | InlineAsm::Extra_HasSideEffects);
+  MachineInstr *LoadMI =
+      BuildMI(LoadBB, DL, TII->get(TargetOpcode::INLINEASM))
+          .addExternalSymbol("load")
+          .addImm(InlineAsm::Extra_MayLoad | InlineAsm::Extra_HasSideEffects);
+
+  int Slot = MF->getFrameInfo().CreateSpillStackObject(4, Align(4));
+  StoreMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+  LoadMI->addOperand(*MF, MachineOperand::CreateFI(Slot));
+
+  SmallVector<MachineInstr *, 1> Stores = {StoreMI};
+  SmallVector<MachineInstr *, 1> Loads = {LoadMI};
+
+  bool Result =
+      AMDGPU::checkAGPRCopyMFMAJointDominance(*MF, *MDT, Stores, Loads, Slot);
+  EXPECT_TRUE(Result);
+}
+
+} // namespace
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index d6cbaf3f3fb5d..57476db34d1f6 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -25,4 +25,5 @@ add_llvm_target_unittest(AMDGPUTests
   ExecMayBeModifiedBeforeAnyUse.cpp
   LiveRegUnits.cpp
   PALMetadata.cpp
+  AMDGPURewriteAGPRCopyMFMATest.cpp
 )