Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit e1b1edc

Browse files
mihajlovicana authored and frederik-h committed Mar 18, 2025
Reland "[AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (llvm#127212)" (llvm#131111)
Consider a VALU->SGPR->SALU dependency (a VALU instruction writes an SGPR and a SALU instruction reads from it). When the VALU instruction is issued, it increments the internal counter VA_SDST, which is used to track use of this SGPR. The SALU instruction will not issue until VA_SDST is zero, that is, until the VALU instruction has finished writing. Therefore, the delays added by s_delay_alu are not needed in this situation.
1 parent d5149eb commit e1b1edc

File tree

94 files changed

+920
-1299
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+920
-1299
lines changed
 

‎llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp

+38
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,13 @@ class AMDGPUInsertDelayAlu {
4747
return false;
4848
}
4949

50+
static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
51+
// These instruction types wait for VA_SDST==0 before issuing.
52+
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD;
53+
54+
return MI.getDesc().TSFlags & VA_SDST_0;
55+
}
56+
5057
// Types of delay that can be encoded in an s_delay_alu instruction.
5158
enum DelayType { VALU, TRANS, SALU, OTHER };
5259

@@ -227,6 +234,16 @@ class AMDGPUInsertDelayAlu {
227234
}
228235
}
229236

237+
void advanceByVALUNum(unsigned VALUNum) {
238+
iterator Next;
239+
for (auto I = begin(), E = end(); I != E; I = Next) {
240+
Next = std::next(I);
241+
if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
242+
erase(I);
243+
}
244+
}
245+
}
246+
230247
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
231248
void dump(const TargetRegisterInfo *TRI) const {
232249
if (empty()) {
@@ -331,6 +348,7 @@ class AMDGPUInsertDelayAlu {
331348
bool Changed = false;
332349
MachineInstr *LastDelayAlu = nullptr;
333350

351+
MCRegUnit LastSGPRFromVALU = 0;
334352
// Iterate over the contents of bundles, but don't emit any instructions
335353
// inside a bundle.
336354
for (auto &MI : MBB.instrs()) {
@@ -345,6 +363,15 @@ class AMDGPUInsertDelayAlu {
345363

346364
DelayType Type = getDelayType(MI.getDesc().TSFlags);
347365

366+
if (instructionWaitsForSGPRWrites(MI)) {
367+
auto It = State.find(LastSGPRFromVALU);
368+
if (It != State.end()) {
369+
DelayInfo Info = It->getSecond();
370+
State.advanceByVALUNum(Info.VALUNum);
371+
LastSGPRFromVALU = 0;
372+
}
373+
}
374+
348375
if (instructionWaitsForVALU(MI)) {
349376
// Forget about all outstanding VALU delays.
350377
// TODO: This is overkill since it also forgets about SALU delays.
@@ -368,6 +395,17 @@ class AMDGPUInsertDelayAlu {
368395
}
369396
}
370397
}
398+
399+
if (SII->isVALU(MI.getOpcode())) {
400+
for (const auto &Op : MI.defs()) {
401+
Register Reg = Op.getReg();
402+
if (AMDGPU::isSGPR(Reg, TRI)) {
403+
LastSGPRFromVALU = *TRI->regunits(Reg).begin();
404+
break;
405+
}
406+
}
407+
}
408+
371409
if (Emit && !MI.isBundledWithPred()) {
372410
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
373411
// just ignore them?

‎llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

-2
Original file line number | Diff line number | Diff line change
@@ -2854,7 +2854,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
28542854
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
28552855
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
28562856
; GFX12-NEXT: s_wait_alu 0xfffd
2857-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
28582857
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
28592858
; GFX12-NEXT: flat_store_b32 v[0:1], v3
28602859
; GFX12-NEXT: s_endpgm
@@ -3842,7 +3841,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
38423841
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
38433842
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
38443843
; GFX12-NEXT: s_wait_alu 0xfffd
3845-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
38463844
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
38473845
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
38483846
; GFX12-NEXT: s_endpgm

0 commit comments

Comments
 (0)
Please sign in to comment.