Skip to content

Commit 71582c6

Browse files
[AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (llvm#127212)
Consider a VALU->SGPR->SALU dependency (a VALU instruction writes to an SGPR and a SALU instruction reads from it). When the VALU instruction is issued, it increments the internal counter VA_SDST, which is used to track use of this SGPR. The SALU instruction will not issue until VA_SDST is zero, that is, until the VALU instruction has finished writing. Therefore, the delays added by s_delay_alu are not needed in this situation.
1 parent ec941a4 commit 71582c6

File tree

93 files changed

+916
-1287
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+916
-1287
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp

+38
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@ class AMDGPUInsertDelayAlu {
4747
return false;
4848
}
4949

50+
// Returns true if MI's TSFlags have the SALU or SMRD bit set.
static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
51+
// SALU and SMRD instructions wait for VA_SDST==0 before issuing.
52+
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD;
53+
54+
// Any overlapping bit yields a non-zero value, which converts to true.
return MI.getDesc().TSFlags & VA_SDST_0;
55+
}
56+
5057
// Types of delay that can be encoded in an s_delay_alu instruction.
5158
enum DelayType { VALU, TRANS, SALU, OTHER };
5259

@@ -227,6 +234,16 @@ class AMDGPUInsertDelayAlu {
227234
}
228235
}
229236

237+
// Erase every entry whose VALUNum is at least the given VALUNum and whose
// VALUCycles is greater than zero. All other entries are left untouched.
void advanceByVALUNum(unsigned VALUNum) {
238+
iterator Next;
239+
for (auto I = begin(), E = end(); I != E; I = Next) {
240+
// Save the successor before I is possibly erased below.
Next = std::next(I);
241+
if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
242+
erase(I);
243+
}
244+
}
245+
}
246+
230247
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
231248
void dump(const TargetRegisterInfo *TRI) const {
232249
if (empty()) {
@@ -331,6 +348,7 @@ class AMDGPUInsertDelayAlu {
331348
bool Changed = false;
332349
MachineInstr *LastDelayAlu = nullptr;
333350

351+
MCRegUnit LastSGPRFromVALU = 0;
334352
// Iterate over the contents of bundles, but don't emit any instructions
335353
// inside a bundle.
336354
for (auto &MI : MBB.instrs()) {
@@ -345,6 +363,15 @@ class AMDGPUInsertDelayAlu {
345363

346364
DelayType Type = getDelayType(MI.getDesc().TSFlags);
347365

366+
if (instructionWaitsForSGPRWrites(MI)) {
367+
auto It = State.find(LastSGPRFromVALU);
368+
if (It != State.end()) {
369+
DelayInfo Info = It->getSecond();
370+
State.advanceByVALUNum(Info.VALUNum);
371+
LastSGPRFromVALU = 0;
372+
}
373+
}
374+
348375
if (instructionWaitsForVALU(MI)) {
349376
// Forget about all outstanding VALU delays.
350377
// TODO: This is overkill since it also forgets about SALU delays.
@@ -368,6 +395,17 @@ class AMDGPUInsertDelayAlu {
368395
}
369396
}
370397
}
398+
399+
if (SII->isVALU(MI.getOpcode())) {
400+
for (const auto &Op : MI.defs()) {
401+
Register Reg = Op.getReg();
402+
if (AMDGPU::isSGPR(Reg, TRI)) {
403+
LastSGPRFromVALU = *TRI->regunits(Reg).begin();
404+
break;
405+
}
406+
}
407+
}
408+
371409
if (Emit && !MI.isBundledWithPred()) {
372410
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
373411
// just ignore them?

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

-2
Original file line numberDiff line numberDiff line change
@@ -2854,7 +2854,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
28542854
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
28552855
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
28562856
; GFX12-NEXT: s_wait_alu 0xfffd
2857-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
28582857
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
28592858
; GFX12-NEXT: flat_store_b32 v[0:1], v3
28602859
; GFX12-NEXT: s_endpgm
@@ -3842,7 +3841,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
38423841
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
38433842
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
38443843
; GFX12-NEXT: s_wait_alu 0xfffd
3845-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
38463844
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
38473845
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
38483846
; GFX12-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll

+48-48
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
234234
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
235235
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
236236
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
237-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
238237
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
238+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
239239
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
240240
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7]
241241
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -360,8 +360,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
360360
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
361361
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
362362
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
363-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
364363
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
364+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
365365
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
366366
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16
367367
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -476,8 +476,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
476476
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
477477
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
478478
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
479-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
480479
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
480+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
481481
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
482482
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
483483
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -604,8 +604,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
604604
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
605605
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
606606
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
607-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
608607
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
608+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
609609
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
610610
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16
611611
; GFX11-NEXT: ; implicit-def: $vgpr4

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll

-1
Original file line numberDiff line numberDiff line change
@@ -1468,7 +1468,6 @@ define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
14681468
; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e]
14691469
; GFX11-NEXT: ;;#ASMSTART
14701470
; GFX11-NEXT: ;;#ASMEND
1471-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
14721471
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9]
14731472
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
14741473
call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

+4-14
Original file line numberDiff line numberDiff line change
@@ -1072,12 +1072,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
10721072
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10731073
; GFX12-NEXT: v_mov_b32_e32 v2, v11
10741074
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1075-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1075+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10761076
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
10771077
; GFX12-NEXT: s_wait_alu 0xf1ff
10781078
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
10791079
; GFX12-NEXT: s_wait_alu 0xfffd
1080-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10811080
; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
10821081
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
10831082
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2436,39 +2435,33 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24362435
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
24372436
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
24382437
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
2439-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2438+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
24402439
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
24412440
; GFX12-NEXT: s_wait_alu 0xf1ff
24422441
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
24432442
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
2444-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24452443
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
24462444
; GFX12-NEXT: s_wait_alu 0xfffd
24472445
; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
24482446
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
2449-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24502447
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
24512448
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
24522449
; GFX12-NEXT: s_wait_alu 0xfffd
24532450
; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
2454-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24552451
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
24562452
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
24572453
; GFX12-NEXT: s_wait_alu 0xfffd
2458-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24592454
; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
24602455
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
2461-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
2456+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
24622457
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
24632458
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
24642459
; GFX12-NEXT: s_wait_alu 0xf1ff
24652460
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
24662461
; GFX12-NEXT: v_mov_b32_e32 v20, v22
2467-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24682462
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
24692463
; GFX12-NEXT: s_wait_alu 0xfffd
24702464
; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
2471-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24722465
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
24732466
; GFX12-NEXT: v_mov_b32_e32 v19, v22
24742467
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
@@ -2490,7 +2483,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24902483
; GFX12-NEXT: s_wait_alu 0xf1ff
24912484
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
24922485
; GFX12-NEXT: v_mov_b32_e32 v14, v21
2493-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24942486
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
24952487
; GFX12-NEXT: s_wait_alu 0xf1ff
24962488
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
@@ -2504,7 +2496,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
25042496
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
25052497
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
25062498
; GFX12-NEXT: s_wait_alu 0xf1ff
2507-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
25082499
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
25092500
; GFX12-NEXT: s_wait_alu 0xf1ff
25102501
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
@@ -2521,10 +2512,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
25212512
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
25222513
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
25232514
; GFX12-NEXT: s_wait_alu 0xfffd
2524-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25252515
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
2516+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25262517
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
2527-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
25282518
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
25292519
; GFX12-NEXT: s_wait_alu 0xf1fd
25302520
; GFX12-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)