-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode #94452
Conversation
[AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode — NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT instructions a little longer so that we can reliably identify them in SIWholeQuadMode.
@llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) Changes: NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT instructions a little longer so that we can reliably identify them in SIWholeQuadMode. Full diff: https://github.com/llvm/llvm-project/pull/94452.diff — 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f178324dbbe24..5dc3457b5bfae 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass {
MachineBasicBlock *emitEndCf(MachineInstr &MI);
- void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
return SplitBB;
}
-void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
- MachineInstr &MI) {
- MachineFunction &MF = *MBB->getParent();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- bool IsWave32 = ST.isWave32();
-
- if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
- // This should be before all vector instructions.
- MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
- .addImm(MI.getOperand(0).getImm());
- if (LIS) {
- LIS->RemoveMachineInstrFromMaps(MI);
- LIS->InsertMachineInstrInMaps(*InitMI);
- }
- MI.eraseFromParent();
- return;
- }
-
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- Register InputReg = MI.getOperand(0).getReg();
- MachineInstr *FirstMI = &*MBB->begin();
- if (InputReg.isVirtual()) {
- MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
- assert(DefInstr && DefInstr->isCopy());
- if (DefInstr->getParent() == MBB) {
- if (DefInstr != FirstMI) {
- // If the `InputReg` is defined in current block, we also need to
- // move that instruction to the beginning of the block.
- DefInstr->removeFromParent();
- MBB->insert(FirstMI, DefInstr);
- if (LIS)
- LIS->handleMove(*DefInstr);
- } else {
- // If first instruction is definition then move pointer after it.
- FirstMI = &*std::next(FirstMI->getIterator());
- }
- }
- }
-
- // Insert instruction sequence at block beginning (before vector operations).
- const DebugLoc DL = MI.getDebugLoc();
- const unsigned WavefrontSize = ST.getWavefrontSize();
- const unsigned Mask = (WavefrontSize << 1) - 1;
- Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- if (LV)
- LV->recomputeForSingleDefVirtReg(InputReg);
- auto BfmMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
- .addReg(CountReg)
- .addImm(0);
- auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(WavefrontSize);
- if (LV)
- LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
- auto CmovMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
-
- if (!LIS) {
- MI.eraseFromParent();
- return;
- }
-
- LIS->RemoveMachineInstrFromMaps(MI);
- MI.eraseFromParent();
-
- LIS->InsertMachineInstrInMaps(*BfeMI);
- LIS->InsertMachineInstrInMaps(*BfmMI);
- LIS->InsertMachineInstrInMaps(*CmpMI);
- LIS->InsertMachineInstrInMaps(*CmovMI);
-
- RecomputeRegs.insert(InputReg);
- LIS->createAndComputeVirtRegInterval(CountReg);
-}
-
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SplitMBB = process(MI);
Changed = true;
break;
-
- // FIXME: find a better place for this
- case AMDGPU::SI_INIT_EXEC:
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
- lowerInitExec(MBB, MI);
- if (LIS)
- LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
- Changed = true;
- break;
-
- default:
- break;
}
if (SplitMBB != MBB) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..f57faa86e90ca 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
+ SmallVector<MachineInstr *, 4> InitExecInstrs;
void printInfo();
@@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerLiveMaskQueries();
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
+ void lowerInitExec(MachineInstr &MI);
+ void lowerInitExecInstrs();
public:
static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+ InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
}
}
+void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ bool IsWave32 = ST->isWave32();
+
+ if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+ // This should be before all vector instructions.
+ MachineInstr *InitMI =
+ BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ Exec)
+ .addImm(MI.getOperand(0).getImm());
+ if (LIS) {
+ LIS->RemoveMachineInstrFromMaps(MI);
+ LIS->InsertMachineInstrInMaps(*InitMI);
+ }
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ Register InputReg = MI.getOperand(0).getReg();
+ MachineInstr *FirstMI = &*MBB->begin();
+ if (InputReg.isVirtual()) {
+ MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+ assert(DefInstr && DefInstr->isCopy());
+ if (DefInstr->getParent() == MBB) {
+ if (DefInstr != FirstMI) {
+ // If the `InputReg` is defined in current block, we also need to
+ // move that instruction to the beginning of the block.
+ DefInstr->removeFromParent();
+ MBB->insert(FirstMI, DefInstr);
+ if (LIS)
+ LIS->handleMove(*DefInstr);
+ } else {
+ // If first instruction is definition then move pointer after it.
+ FirstMI = &*std::next(FirstMI->getIterator());
+ }
+ }
+ }
+
+ // Insert instruction sequence at block beginning (before vector operations).
+ const DebugLoc DL = MI.getDebugLoc();
+ const unsigned WavefrontSize = ST->getWavefrontSize();
+ const unsigned Mask = (WavefrontSize << 1) - 1;
+ Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+ auto BfmMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+ .addReg(CountReg)
+ .addImm(0);
+ auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(WavefrontSize);
+ auto CmovMI =
+ BuildMI(*MBB, FirstMI, DL,
+ TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+ Exec)
+ .addImm(-1);
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+
+ LIS->InsertMachineInstrInMaps(*BfeMI);
+ LIS->InsertMachineInstrInMaps(*BfmMI);
+ LIS->InsertMachineInstrInMaps(*CmpMI);
+ LIS->InsertMachineInstrInMaps(*CmovMI);
+
+ LIS->removeInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(InputReg);
+ LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
+void SIWholeQuadMode::lowerInitExecInstrs() {
+ for (MachineInstr *MI : InitExecInstrs)
+ lowerInitExec(*MI);
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
KillInstrs.clear();
+ InitExecInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1605,11 +1703,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Shader is simple does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
- LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ LowerToMovInstrs.empty() && KillInstrs.empty() &&
+ InitExecInstrs.empty()) {
lowerLiveMaskQueries();
return !LiveMaskQueries.empty();
}
+ lowerInitExecInstrs();
+
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
|
This is split out from #93680. |
@@ -1605,11 +1703,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {

   // Shader is simple does not need any state changes or any complex lowering
   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
-      LowerToMovInstrs.empty() && KillInstrs.empty()) {
+      LowerToMovInstrs.empty() && KillInstrs.empty() &&
+      InitExecInstrs.empty()) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you just add lowerInitExecInstrs()
to this code path and avoid the empty check.
This way there is no need to run the rest of the pass for just an SI_INIT_EXEC?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. But really, as a future cleanup, I'd prefer to remove this fast path. Instead we should make sure that the "slow" path below is actually fast when all the Instrs
vectors are empty, and returns an accurate "Changed" flag.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This is a straight forward relocation of the existing code.
Fix machine verification failure from INIT_EXEC lowering since it was moved from SILowerControlFlow to SIWholeQuadMode in llvm#94452.
…vm#98333) Fix machine verification failure from INIT_EXEC lowering since it was moved from SILowerControlFlow to SIWholeQuadMode in llvm#94452.
NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT
instructions a little longer so that we can reliably identify them in
SIWholeQuadMode.