From 5f7ecf7036a668ced623febab110f8620e17d863 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 29 May 2024 13:04:52 +0100 Subject: [PATCH 1/2] [AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT instructions a little longer so that we can reliably identify them in SIWholeQuadMode. --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 103 ------------------ llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 103 +++++++++++++++++- 2 files changed, 102 insertions(+), 104 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f178324dbbe24..5dc3457b5bfae 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass { MachineBasicBlock *emitEndCf(MachineInstr &MI); - void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI); - void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl &Src) const; @@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { return SplitBB; } -void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB, - MachineInstr &MI) { - MachineFunction &MF = *MBB->getParent(); - const GCNSubtarget &ST = MF.getSubtarget(); - bool IsWave32 = ST.isWave32(); - - if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { - // This should be before all vector instructions. - MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), - TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec) - .addImm(MI.getOperand(0).getImm()); - if (LIS) { - LIS->RemoveMachineInstrFromMaps(MI); - LIS->InsertMachineInstrInMaps(*InitMI); - } - MI.eraseFromParent(); - return; - } - - // Extract the thread count from an SGPR input and set EXEC accordingly. - // Since BFM can't shift by 64, handle that case with CMP + CMOV. - // - // S_BFE_U32 count, input, {shift, 7} - // S_BFM_B64 exec, count, 0 - // S_CMP_EQ_U32 count, 64 - // S_CMOV_B64 exec, -1 - Register InputReg = MI.getOperand(0).getReg(); - MachineInstr *FirstMI = &*MBB->begin(); - if (InputReg.isVirtual()) { - MachineInstr *DefInstr = MRI->getVRegDef(InputReg); - assert(DefInstr && DefInstr->isCopy()); - if (DefInstr->getParent() == MBB) { - if (DefInstr != FirstMI) { - // If the `InputReg` is defined in current block, we also need to - // move that instruction to the beginning of the block. - DefInstr->removeFromParent(); - MBB->insert(FirstMI, DefInstr); - if (LIS) - LIS->handleMove(*DefInstr); - } else { - // If first instruction is definition then move pointer after it. - FirstMI = &*std::next(FirstMI->getIterator()); - } - } - } - - // Insert instruction sequence at block beginning (before vector operations). - const DebugLoc DL = MI.getDebugLoc(); - const unsigned WavefrontSize = ST.getWavefrontSize(); - const unsigned Mask = (WavefrontSize << 1) - 1; - Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); - auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) - .addReg(InputReg) - .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); - if (LV) - LV->recomputeForSingleDefVirtReg(InputReg); - auto BfmMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) - .addReg(CountReg) - .addImm(0); - auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) - .addReg(CountReg, RegState::Kill) - .addImm(WavefrontSize); - if (LV) - LV->getVarInfo(CountReg).Kills.push_back(CmpMI); - auto CmovMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), - Exec) - .addImm(-1); - - if (!LIS) { - MI.eraseFromParent(); - return; - } - - LIS->RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - - LIS->InsertMachineInstrInMaps(*BfeMI); - LIS->InsertMachineInstrInMaps(*BfmMI); - LIS->InsertMachineInstrInMaps(*CmpMI); - LIS->InsertMachineInstrInMaps(*CmovMI); - - RecomputeRegs.insert(InputReg); - LIS->createAndComputeVirtRegInterval(CountReg); -} - bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { for (auto &I : MBB.instrs()) { if (!I.isDebugInstr() && !I.isUnconditionalBranch()) @@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { SplitMBB = process(MI); Changed = true; break; - - // FIXME: find a better place for this - case AMDGPU::SI_INIT_EXEC: - case AMDGPU::SI_INIT_EXEC_FROM_INPUT: - lowerInitExec(MBB, MI); - if (LIS) - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - Changed = true; - break; - - default: - break; } if (SplitMBB != MBB) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 09dc1c781e2f3..f57faa86e90ca 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass { SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; SmallVector KillInstrs; + SmallVector InitExecInstrs; void printInfo(); @@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass { void lowerLiveMaskQueries(); void lowerCopyInstrs(); void lowerKillInstrs(bool IsWQM); + void lowerInitExec(MachineInstr &MI); + void lowerInitExecInstrs(); public: static char ID; @@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, Opcode == AMDGPU::SI_DEMOTE_I1) { KillInstrs.push_back(&MI); BBI.NeedsLowering = true; + } else if (Opcode == AMDGPU::SI_INIT_EXEC || + Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) { + InitExecInstrs.push_back(&MI); } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { } } +void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getParent(); + bool IsWave32 = ST->isWave32(); + + if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { + // This should be before all vector instructions. + MachineInstr *InitMI = + BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), + TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + Exec) + .addImm(MI.getOperand(0).getImm()); + if (LIS) { + LIS->RemoveMachineInstrFromMaps(MI); + LIS->InsertMachineInstrInMaps(*InitMI); + } + MI.eraseFromParent(); + return; + } + + // Extract the thread count from an SGPR input and set EXEC accordingly. + // Since BFM can't shift by 64, handle that case with CMP + CMOV. + // + // S_BFE_U32 count, input, {shift, 7} + // S_BFM_B64 exec, count, 0 + // S_CMP_EQ_U32 count, 64 + // S_CMOV_B64 exec, -1 + Register InputReg = MI.getOperand(0).getReg(); + MachineInstr *FirstMI = &*MBB->begin(); + if (InputReg.isVirtual()) { + MachineInstr *DefInstr = MRI->getVRegDef(InputReg); + assert(DefInstr && DefInstr->isCopy()); + if (DefInstr->getParent() == MBB) { + if (DefInstr != FirstMI) { + // If the `InputReg` is defined in current block, we also need to + // move that instruction to the beginning of the block. + DefInstr->removeFromParent(); + MBB->insert(FirstMI, DefInstr); + if (LIS) + LIS->handleMove(*DefInstr); + } else { + // If first instruction is definition then move pointer after it. + FirstMI = &*std::next(FirstMI->getIterator()); + } + } + } + + // Insert instruction sequence at block beginning (before vector operations). + const DebugLoc DL = MI.getDebugLoc(); + const unsigned WavefrontSize = ST->getWavefrontSize(); + const unsigned Mask = (WavefrontSize << 1) - 1; + Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); + auto BfmMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) + .addReg(CountReg) + .addImm(0); + auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(WavefrontSize); + auto CmovMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + Exec) + .addImm(-1); + + if (!LIS) { + MI.eraseFromParent(); + return; + } + + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + + LIS->InsertMachineInstrInMaps(*BfeMI); + LIS->InsertMachineInstrInMaps(*BfmMI); + LIS->InsertMachineInstrInMaps(*CmpMI); + LIS->InsertMachineInstrInMaps(*CmovMI); + + LIS->removeInterval(InputReg); + LIS->createAndComputeVirtRegInterval(InputReg); + LIS->createAndComputeVirtRegInterval(CountReg); +} + +void SIWholeQuadMode::lowerInitExecInstrs() { + for (MachineInstr *MI : InitExecInstrs) + lowerInitExec(*MI); +} + bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() << " ------------- \n"); @@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); KillInstrs.clear(); + InitExecInstrs.clear(); StateTransition.clear(); ST = &MF.getSubtarget(); @@ -1605,11 +1703,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // Shader is simple does not need any state changes or any complex lowering if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && - LowerToMovInstrs.empty() && KillInstrs.empty()) { + LowerToMovInstrs.empty() && KillInstrs.empty() && + InitExecInstrs.empty()) { lowerLiveMaskQueries(); return !LiveMaskQueries.empty(); } + lowerInitExecInstrs(); + MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); From 543cc94d01a7c3e9d1cee256acaead1cd5c55272 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 5 Jun 2024 14:17:42 +0100 Subject: [PATCH 2/2] Add lowerInitExecInstrs to fast path --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index f57faa86e90ca..5b4c44302fa62 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1703,10 +1703,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // Shader is simple does not need any state changes or any complex lowering if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && - LowerToMovInstrs.empty() && KillInstrs.empty() && - InitExecInstrs.empty()) { + LowerToMovInstrs.empty() && KillInstrs.empty()) { + lowerInitExecInstrs(); lowerLiveMaskQueries(); - return !LiveMaskQueries.empty(); + return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); } lowerInitExecInstrs();