Skip to content

Commit fefcb44

Browse files
committed
eliminated double VALUNum increment in advanceByNum function
1 parent d76f4a1 commit fefcb44

16 files changed

+110
-147
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp

+10 -5
Original file line number | Diff line number | Diff line change
@@ -61,16 +61,17 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
6161
return false;
6262
}
6363

64-
static bool instructionWaitsForSALUWrites(const MachineInstr &MI) {
64+
static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
6565
// These instruction types wait for VA_SDST==0 before issuing.
6666
// S_CBRANCH_EXECZ and S_CBRANCH_VCCZ are covered by SALU flag
6767
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::EXP |
6868
SIInstrFlags::DS | SIInstrFlags::SMRD |
6969
SIInstrFlags::MIMG | SIInstrFlags::VIMAGE |
7070
SIInstrFlags::VSAMPLE;
71-
71+
7272
if (MI.getDesc().TSFlags & VA_SDST_0)
7373
return true;
74+
7475
return false;
7576
}
7677

@@ -254,12 +255,15 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
254255
}
255256
}
256257

257-
void advanceByNum(DelayType Type, unsigned Cycles, unsigned VALUNum) {
258+
void advanceByNum(DelayType Type, unsigned Cycles, unsigned SGPRWriteVALUNum) {
258259
iterator Next;
259260
for (auto I = begin(), E = end(); I != E; I = Next) {
260261
Next = std::next(I);
261-
if (I->second.VALUNum >= VALUNum && I->second.advance(Type, Cycles))
262+
if (I->second.VALUNum >= SGPRWriteVALUNum && I->second.VALUCycles > 0){
262263
erase(I);
264+
265+
266+
}
263267
}
264268
}
265269

@@ -382,14 +386,15 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
382386

383387
DelayType Type = getDelayType(MI.getDesc().TSFlags);
384388

385-
if (instructionWaitsForSALUWrites(MI)) {
389+
if (instructionWaitsForSGPRWrites(MI)) {
386390
auto It = State.find(lastSGPRfromVALU);
387391
if (It != State.end()) {
388392
DelayInfo Info = It->getSecond();
389393
State.advanceByNum(VALU, Info.VALUCycles, Info.VALUNum);
390394
lastSGPRfromVALU = 0;
391395
}
392396
}
397+
393398

394399
if (instructionWaitsForVALU(MI)) {
395400
// Forget about all outstanding VALU delays.

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

-2
Original file line number | Diff line number | Diff line change
@@ -2854,7 +2854,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
28542854
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
28552855
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
28562856
; GFX12-NEXT: s_wait_alu 0xfffd
2857-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
28582857
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
28592858
; GFX12-NEXT: flat_store_b32 v[0:1], v3
28602859
; GFX12-NEXT: s_endpgm
@@ -3842,7 +3841,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
38423841
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
38433842
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
38443843
; GFX12-NEXT: s_wait_alu 0xfffd
3845-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
38463844
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
38473845
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
38483846
; GFX12-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll

+48 -48
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

+7 -11
Original file line number | Diff line number | Diff line change
@@ -1072,14 +1072,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
10721072
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10731073
; GFX12-NEXT: v_mov_b32_e32 v2, v11
10741074
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1075-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1075+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10761076
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
10771077
; GFX12-NEXT: s_wait_alu 0xf1ff
10781078
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
10791079
; GFX12-NEXT: s_wait_alu 0xfffd
10801080
; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
1081-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10821081
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
1082+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
10831083
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
10841084
; GFX12-NEXT: s_wait_alu 0xf1fd
10851085
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -2435,12 +2435,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24352435
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
24362436
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
24372437
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
2438-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
2438+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
24392439
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
24402440
; GFX12-NEXT: s_wait_alu 0xf1ff
24412441
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
24422442
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
2443-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
24442443
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
24452444
; GFX12-NEXT: s_wait_alu 0xfffd
24462445
; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
@@ -2449,31 +2448,29 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24492448
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
24502449
; GFX12-NEXT: s_wait_alu 0xfffd
24512450
; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
2452-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24532451
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
24542452
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
24552453
; GFX12-NEXT: s_wait_alu 0xfffd
24562454
; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
2457-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
24582455
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
2456+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
24592457
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
24602458
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
24612459
; GFX12-NEXT: s_wait_alu 0xf1ff
24622460
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
2463-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
24642461
; GFX12-NEXT: v_mov_b32_e32 v20, v22
24652462
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
24662463
; GFX12-NEXT: s_wait_alu 0xfffd
24672464
; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
24682465
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
2469-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24702466
; GFX12-NEXT: v_mov_b32_e32 v19, v22
24712467
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
2468+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
24722469
; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
24732470
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
24742471
; GFX12-NEXT: v_mov_b32_e32 v20, v18
2475-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
24762472
; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
2473+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
24772474
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
24782475
; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
24792476
; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
@@ -2515,10 +2512,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
25152512
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
25162513
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
25172514
; GFX12-NEXT: s_wait_alu 0xfffd
2518-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
25192515
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
2516+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25202517
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
2521-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
25222518
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
25232519
; GFX12-NEXT: s_wait_alu 0xf1fd
25242520
; GFX12-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

+8 -18
Original file line number | Diff line number | Diff line change
@@ -2145,12 +2145,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
21452145
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
21462146
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
21472147
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
2148-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2148+
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
21492149
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
21502150
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
21512151
; GFX1164-NEXT: s_mov_b32 s2, -1
21522152
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
2153-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
21542153
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
21552154
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
21562155
; GFX1164-NEXT: s_endpgm
@@ -2189,12 +2188,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
21892188
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
21902189
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
21912190
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
2192-
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2191+
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
21932192
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
21942193
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
21952194
; GFX1132-NEXT: s_mov_b32 s2, -1
21962195
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
2197-
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
21982196
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
21992197
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
22002198
; GFX1132-NEXT: s_endpgm
@@ -2232,7 +2230,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
22322230
; GFX1264-NEXT: s_wait_kmcnt 0x0
22332231
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
22342232
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
2235-
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2233+
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
22362234
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
22372235
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
22382236
; GFX1264-NEXT: s_mov_b32 s2, -1
@@ -2272,7 +2270,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
22722270
; GFX1232-NEXT: s_wait_kmcnt 0x0
22732271
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
22742272
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
2275-
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2273+
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
22762274
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
22772275
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
22782276
; GFX1232-NEXT: s_mov_b32 s2, -1
@@ -3244,7 +3242,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
32443242
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
32453243
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
32463244
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
3247-
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3245+
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
32483246
; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
32493247
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
32503248
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
@@ -3329,7 +3327,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
33293327
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
33303328
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
33313329
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
3332-
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3330+
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
33333331
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
33343332
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
33353333
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
@@ -4068,7 +4066,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
40684066
; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0
40694067
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
40704068
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
4071-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
40724069
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
40734070
; GFX1164-NEXT: s_mov_b32 s2, -1
40744071
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4105,7 +4102,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
41054102
; GFX1132-NEXT: v_mul_lo_u32 v0, s4, v0
41064103
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
41074104
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
4108-
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
41094105
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
41104106
; GFX1132-NEXT: s_mov_b32 s2, -1
41114107
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4144,7 +4140,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
41444140
; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
41454141
; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
41464142
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
4147-
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3)
41484143
; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
41494144
; GFX1264-NEXT: s_mov_b32 s2, -1
41504145
; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4182,7 +4177,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
41824177
; GFX1232-NEXT: v_mul_lo_u32 v0, s4, v0
41834178
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
41844179
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
4185-
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3)
41864180
; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0
41874181
; GFX1232-NEXT: s_mov_b32 s2, -1
41884182
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -5716,7 +5710,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
57165710
; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
57175711
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
57185712
; GFX1164-NEXT: s_mov_b32 s2, -1
5719-
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
57205713
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc
57215714
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
57225715
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5761,7 +5754,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
57615754
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
57625755
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
57635756
; GFX1132-NEXT: s_mov_b32 s2, -1
5764-
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
57655757
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo
57665758
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
57675759
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5805,7 +5797,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
58055797
; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
58065798
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3
58075799
; GFX1264-NEXT: s_mov_b32 s2, -1
5808-
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3)
58095800
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc
58105801
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
58115802
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -5848,7 +5839,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
58485839
; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
58495840
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
58505841
; GFX1232-NEXT: s_mov_b32 s2, -1
5851-
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3)
58525842
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo
58535843
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
58545844
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -6818,7 +6808,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
68186808
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
68196809
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
68206810
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
6821-
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6811+
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
68226812
; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
68236813
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
68246814
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
@@ -6903,7 +6893,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
69036893
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
69046894
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
69056895
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
6906-
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6896+
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
69076897
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
69086898
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
69096899
; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo

0 commit comments

Comments
 (0)