Skip to content

Commit aa008e0

Browse files
Revert "[AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (llvm#127212)"
This reverts commit 71582c6. Multiple buildbot failures have been reported; see llvm#127212 for details.
1 parent 598e882 commit aa008e0

File tree

93 files changed: +1287 / -916 lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+1287
-916
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp

-38
Original file line number | Diff line number | Diff line change
@@ -47,13 +47,6 @@ class AMDGPUInsertDelayAlu {
4747
return false;
4848
}
4949

50-
static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
51-
// These instruction types wait for VA_SDST==0 before issuing.
52-
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD;
53-
54-
return MI.getDesc().TSFlags & VA_SDST_0;
55-
}
56-
5750
// Types of delay that can be encoded in an s_delay_alu instruction.
5851
enum DelayType { VALU, TRANS, SALU, OTHER };
5952

@@ -234,16 +227,6 @@ class AMDGPUInsertDelayAlu {
234227
}
235228
}
236229

237-
void advanceByVALUNum(unsigned VALUNum) {
238-
iterator Next;
239-
for (auto I = begin(), E = end(); I != E; I = Next) {
240-
Next = std::next(I);
241-
if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
242-
erase(I);
243-
}
244-
}
245-
}
246-
247230
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
248231
void dump(const TargetRegisterInfo *TRI) const {
249232
if (empty()) {
@@ -348,7 +331,6 @@ class AMDGPUInsertDelayAlu {
348331
bool Changed = false;
349332
MachineInstr *LastDelayAlu = nullptr;
350333

351-
MCRegUnit LastSGPRFromVALU = 0;
352334
// Iterate over the contents of bundles, but don't emit any instructions
353335
// inside a bundle.
354336
for (auto &MI : MBB.instrs()) {
@@ -363,15 +345,6 @@ class AMDGPUInsertDelayAlu {
363345

364346
DelayType Type = getDelayType(MI.getDesc().TSFlags);
365347

366-
if (instructionWaitsForSGPRWrites(MI)) {
367-
auto It = State.find(LastSGPRFromVALU);
368-
if (It != State.end()) {
369-
DelayInfo Info = It->getSecond();
370-
State.advanceByVALUNum(Info.VALUNum);
371-
LastSGPRFromVALU = 0;
372-
}
373-
}
374-
375348
if (instructionWaitsForVALU(MI)) {
376349
// Forget about all outstanding VALU delays.
377350
// TODO: This is overkill since it also forgets about SALU delays.
@@ -395,17 +368,6 @@ class AMDGPUInsertDelayAlu {
395368
}
396369
}
397370
}
398-
399-
if (SII->isVALU(MI.getOpcode())) {
400-
for (const auto &Op : MI.defs()) {
401-
Register Reg = Op.getReg();
402-
if (AMDGPU::isSGPR(Reg, TRI)) {
403-
LastSGPRFromVALU = *TRI->regunits(Reg).begin();
404-
break;
405-
}
406-
}
407-
}
408-
409371
if (Emit && !MI.isBundledWithPred()) {
410372
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
411373
// just ignore them?

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

+2
Original file line number | Diff line number | Diff line change
@@ -2854,6 +2854,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
28542854
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
28552855
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
28562856
; GFX12-NEXT: s_wait_alu 0xfffd
2857+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
28572858
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
28582859
; GFX12-NEXT: flat_store_b32 v[0:1], v3
28592860
; GFX12-NEXT: s_endpgm
@@ -3841,6 +3842,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
38413842
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
38423843
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
38433844
; GFX12-NEXT: s_wait_alu 0xfffd
3845+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
38443846
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
38453847
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
38463848
; GFX12-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll

+48-48
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

+4-4
Original file line number | Diff line number | Diff line change
@@ -234,8 +234,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
234234
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
235235
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
236236
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
237+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
237238
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
238-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
239239
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
240240
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7]
241241
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -360,8 +360,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
360360
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
361361
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
362362
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
363+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
363364
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
364-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
365365
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
366366
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16
367367
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -476,8 +476,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
476476
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
477477
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
478478
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
479+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
479480
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
480-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
481481
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
482482
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
483483
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -604,8 +604,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
604604
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
605605
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
606606
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
607+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
607608
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
608-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
609609
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
610610
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16
611611
; GFX11-NEXT: ; implicit-def: $vgpr4

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll

+1
Original file line number | Diff line number | Diff line change
@@ -1468,6 +1468,7 @@ define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
14681468
; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e]
14691469
; GFX11-NEXT: ;;#ASMSTART
14701470
; GFX11-NEXT: ;;#ASMEND
1471+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
14711472
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9]
14721473
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
14731474
call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

+14-4
Original file line number | Diff line number | Diff line change
@@ -1072,11 +1072,12 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
10721072
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10731073
; GFX12-NEXT: v_mov_b32_e32 v2, v11
10741074
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1075-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1075+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10761076
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
10771077
; GFX12-NEXT: s_wait_alu 0xf1ff
10781078
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
10791079
; GFX12-NEXT: s_wait_alu 0xfffd
1080+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10801081
; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
10811082
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
10821083
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2435,33 +2436,39 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24352436
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
24362437
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
24372438
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
2438-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2439+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
24392440
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
24402441
; GFX12-NEXT: s_wait_alu 0xf1ff
24412442
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
24422443
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
2444+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24432445
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
24442446
; GFX12-NEXT: s_wait_alu 0xfffd
24452447
; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
24462448
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
2449+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24472450
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
24482451
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
24492452
; GFX12-NEXT: s_wait_alu 0xfffd
24502453
; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
2454+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24512455
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
24522456
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
24532457
; GFX12-NEXT: s_wait_alu 0xfffd
2458+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24542459
; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
24552460
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
2456-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2461+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
24572462
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
24582463
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
24592464
; GFX12-NEXT: s_wait_alu 0xf1ff
24602465
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
24612466
; GFX12-NEXT: v_mov_b32_e32 v20, v22
2467+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24622468
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
24632469
; GFX12-NEXT: s_wait_alu 0xfffd
24642470
; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
2471+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24652472
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
24662473
; GFX12-NEXT: v_mov_b32_e32 v19, v22
24672474
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
@@ -2483,6 +2490,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24832490
; GFX12-NEXT: s_wait_alu 0xf1ff
24842491
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
24852492
; GFX12-NEXT: v_mov_b32_e32 v14, v21
2493+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24862494
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
24872495
; GFX12-NEXT: s_wait_alu 0xf1ff
24882496
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
@@ -2496,6 +2504,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24962504
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
24972505
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
24982506
; GFX12-NEXT: s_wait_alu 0xf1ff
2507+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
24992508
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
25002509
; GFX12-NEXT: s_wait_alu 0xf1ff
25012510
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
@@ -2512,9 +2521,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
25122521
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
25132522
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
25142523
; GFX12-NEXT: s_wait_alu 0xfffd
2515-
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
25162524
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2525+
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
25172526
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
2527+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
25182528
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
25192529
; GFX12-NEXT: s_wait_alu 0xf1fd
25202530
; GFX12-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)