Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode #94452

Merged
merged 2 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 0 additions & 103 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass {

MachineBasicBlock *emitEndCf(MachineInstr &MI);

void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;

Expand Down Expand Up @@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
return SplitBB;
}

/// Replace the EXEC-initialisation pseudo \p MI with real scalar instructions
/// inserted at the start of \p MBB, then erase \p MI.
///
/// Two forms are handled:
///  * SI_INIT_EXEC: operand 0 is an immediate that is moved straight into
///    Exec with S_MOV_B32 / S_MOV_B64 (chosen by wave size).
///  * otherwise (the visible caller also dispatches SI_INIT_EXEC_FROM_INPUT
///    here): operand 0 is a register and operand 1 an immediate; a
///    BFE / BFM / CMP / CMOV sequence derives Exec from a count extracted
///    from that register.
///
/// LiveVariables (LV) and LiveIntervals (LIS) are updated when they are
/// non-null.
///
/// \param MBB Block that receives the new instructions.
/// \param MI  The pseudo instruction to lower; it is erased before returning.
void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
MachineInstr &MI) {
MachineFunction &MF = *MBB->getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// Selects the 32-bit or 64-bit flavour of every Exec-writing opcode below.
bool IsWave32 = ST.isWave32();

if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
.addImm(MI.getOperand(0).getImm());
// Swap the pseudo for its replacement in the slot-index maps before the
// pseudo is erased.
if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
LIS->InsertMachineInstrInMaps(*InitMI);
}
MI.eraseFromParent();
return;
}

// Extract the thread count from an SGPR input and set EXEC accordingly.
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
//
// S_BFE_U32 count, input, {shift, 7}
// S_BFM_B64 exec, count, 0
// S_CMP_EQ_U32 count, 64
// S_CMOV_B64 exec, -1
Register InputReg = MI.getOperand(0).getReg();
// Insertion point for the new sequence; starts at the first instruction of
// the block and may be advanced past InputReg's definition below.
MachineInstr *FirstMI = &*MBB->begin();
if (InputReg.isVirtual()) {
MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
// A virtual input is expected to be defined by a COPY.
assert(DefInstr && DefInstr->isCopy());
if (DefInstr->getParent() == MBB) {
if (DefInstr != FirstMI) {
// If the `InputReg` is defined in current block, we also need to
// move that instruction to the beginning of the block.
DefInstr->removeFromParent();
MBB->insert(FirstMI, DefInstr);
if (LIS)
LIS->handleMove(*DefInstr);
} else {
// If first instruction is definition then move pointer after it.
FirstMI = &*std::next(FirstMI->getIterator());
}
}
}

// Insert instruction sequence at block beginning (before vector operations).
const DebugLoc DL = MI.getDebugLoc();
const unsigned WavefrontSize = ST.getWavefrontSize();
// (2 * WavefrontSize) - 1; applied to the immediate taken from operand 1.
const unsigned Mask = (WavefrontSize << 1) - 1;
Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
// 0x70000 is 7 << 16, matching the "{shift, 7}" pair in the sketch above
// (presumably the BFE field width -- confirm against the ISA encoding).
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
.addReg(InputReg)
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
// BfeMI added a new use of InputReg, so its LiveVariables info is rebuilt.
if (LV)
LV->recomputeForSingleDefVirtReg(InputReg);
auto BfmMI =
BuildMI(*MBB, FirstMI, DL,
TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
.addReg(CountReg)
.addImm(0);
// The compare is the last reader of CountReg, hence the Kill flag.
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(CountReg, RegState::Kill)
.addImm(WavefrontSize);
// Record that kill in LiveVariables as well.
if (LV)
LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
// Full-wave case that BFM cannot express (see the note above): Exec = -1.
auto CmovMI =
BuildMI(*MBB, FirstMI, DL,
TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
Exec)
.addImm(-1);

// Without LiveIntervals there is nothing further to maintain.
if (!LIS) {
MI.eraseFromParent();
return;
}

// Drop the pseudo from the slot-index maps before erasing it, then register
// the four replacement instructions in program order.
LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();

LIS->InsertMachineInstrInMaps(*BfeMI);
LIS->InsertMachineInstrInMaps(*BfmMI);
LIS->InsertMachineInstrInMaps(*CmpMI);
LIS->InsertMachineInstrInMaps(*CmovMI);

// InputReg is queued in RecomputeRegs (its consumer is outside this
// function -- presumably its live interval is rebuilt later; verify), while
// the brand-new CountReg gets its interval computed immediately.
RecomputeRegs.insert(InputReg);
LIS->createAndComputeVirtRegInterval(CountReg);
}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
for (auto &I : MBB.instrs()) {
if (!I.isDebugInstr() && !I.isUnconditionalBranch())
Expand Down Expand Up @@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SplitMBB = process(MI);
Changed = true;
break;

// FIXME: find a better place for this
case AMDGPU::SI_INIT_EXEC:
case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
lowerInitExec(MBB, MI);
if (LIS)
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
Changed = true;
break;

default:
break;
}

if (SplitMBB != MBB) {
Expand Down
103 changes: 102 additions & 1 deletion llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
SmallVector<MachineInstr *, 4> InitExecInstrs;

void printInfo();

Expand Down Expand Up @@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerLiveMaskQueries();
void lowerCopyInstrs();
void lowerKillInstrs(bool IsWQM);
void lowerInitExec(MachineInstr &MI);
void lowerInitExecInstrs();

public:
static char ID;
Expand Down Expand Up @@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
Expand Down Expand Up @@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
}
}

/// Lower an EXEC-initialisation pseudo into real scalar instructions placed
/// at the top of the pseudo's parent block, and erase the pseudo.
///
/// SI_INIT_EXEC carries an immediate (operand 0) that is moved directly into
/// Exec. Any other opcode routed here is treated as the "from input" form:
/// operand 0 is a register and operand 1 an immediate, and Exec is derived
/// from a count extracted out of that register. LiveIntervals is kept up to
/// date when it is available.
///
/// \param MI The pseudo to lower; it no longer exists after the call.
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  const bool Wave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // Immediate form. The move must precede every vector instruction, so it
    // goes at the very start of the block.
    const unsigned MovOpc = Wave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MachineInstr *MovMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), TII->get(MovOpc), Exec)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*MovMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Register form: pull the thread count out of an SGPR input and build the
  // mask from it. BFM cannot shift by 64, so the full-wave case is patched
  // up with a compare and a conditional move:
  //
  //   S_BFE_U32    count, input, {shift, 7}
  //   S_BFM_B64    exec, count, 0
  //   S_CMP_EQ_U32 count, 64
  //   S_CMOV_B64   exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *InsertPt = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *InputDef = MRI->getVRegDef(InputReg);
    assert(InputDef && InputDef->isCopy());
    if (InputDef->getParent() == MBB) {
      if (InputDef == InsertPt) {
        // The definition already leads the block; insert right after it.
        InsertPt = &*std::next(InsertPt->getIterator());
      } else {
        // The definition lives further down this block: hoist it to the
        // front so the sequence inserted after it can read InputReg.
        InputDef->removeFromParent();
        MBB->insert(InsertPt, InputDef);
        if (LIS)
          LIS->handleMove(*InputDef);
      }
    }
  }

  // Emit the sequence at the chosen point (ahead of any vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WaveSize = ST->getWavefrontSize();
  const unsigned ShiftMask = (WaveSize << 1) - 1;
  const unsigned BfmOpc = Wave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64;
  const unsigned CmovOpc = Wave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);

  // Masked operand-1 immediate combined with 7 << 16 (the "{shift, 7}" pair
  // in the sketch above).
  const auto BfeField = (MI.getOperand(1).getImm() & ShiftMask) | 0x70000;
  auto BfeMI =
      BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
          .addReg(InputReg)
          .addImm(BfeField);
  auto BfmMI = BuildMI(*MBB, InsertPt, DL, TII->get(BfmOpc), Exec)
                   .addReg(CountReg)
                   .addImm(0);
  // Last reader of CountReg, hence the kill flag.
  auto CmpMI = BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WaveSize);
  auto CmovMI =
      BuildMI(*MBB, InsertPt, DL, TII->get(CmovOpc), Exec).addImm(-1);

  // The pseudo leaves the slot-index maps (when they exist) before it is
  // erased.
  if (LIS)
    LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();
  if (!LIS)
    return;

  // Register the replacements in program order.
  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  // InputReg gained a use (and its def may have moved): rebuild its interval
  // from scratch. CountReg is new and needs a first interval.
  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

void SIWholeQuadMode::lowerInitExecInstrs() {
for (MachineInstr *MI : InitExecInstrs)
lowerInitExec(*MI);
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
Expand All @@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
KillInstrs.clear();
InitExecInstrs.clear();
StateTransition.clear();

ST = &MF.getSubtarget<GCNSubtarget>();
Expand Down Expand Up @@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
// Shader is simple and does not need any state changes or any complex lowering
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
LowerToMovInstrs.empty() && KillInstrs.empty()) {
lowerInitExecInstrs();
lowerLiveMaskQueries();
return !LiveMaskQueries.empty();
return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
}

lowerInitExecInstrs();

MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

Expand Down
Loading