Skip to content

Commit 58e44d3

Browse files
dstutt authored and tstellar committed
[AMDGPU] Enhance s_waitcnt insertion before barrier for gfx12 (#90595)
The code that determines whether a waitcnt is required before a barrier instruction only considered S_BARRIER. gfx12 adds S_BARRIER_SIGNAL*/S_BARRIER_WAIT, so the existing check needs to be enhanced to look for a barrier start (which is just S_BARRIER on earlier architectures).
1 parent d1d7131 commit 58e44d3

File tree

4 files changed

+36
-1
lines changed

4 files changed

+36
-1
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1832,7 +1832,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18321832
// not, we need to ensure the subtarget is capable of backing off barrier
18331833
// instructions in case there are any outstanding memory operations that may
18341834
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1835-
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1835+
if (TII->isBarrierStart(MI.getOpcode()) &&
18361836
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
18371837
Wait = Wait.combined(
18381838
AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));

llvm/lib/Target/AMDGPU/SIInstrInfo.h

+11
Original file line numberDiff line numberDiff line change
@@ -908,6 +908,17 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
908908
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
909909
}
910910

911+
// Returns true if Opcode is an instruction that starts a barrier. Before gfx12
912+
// this is only S_BARRIER; with S_BARRIER_SIGNAL* / S_BARRIER_WAIT the start is
913+
// any S_BARRIER_SIGNAL* form listed below (S_BARRIER_WAIT is not matched).
914+
bool isBarrierStart(unsigned Opcode) const {
915+
return Opcode == AMDGPU::S_BARRIER ||
916+
Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
917+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
918+
Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
919+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
920+
}
921+
911922
static bool doesNotReadTiedSource(const MachineInstr &MI) {
912923
return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
913924
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll

+2
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
9696
; VARIANT4-NEXT: s_wait_kmcnt 0x0
9797
; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2
9898
; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1]
99+
; VARIANT4-NEXT: s_wait_storecnt 0x0
99100
; VARIANT4-NEXT: s_barrier_signal -1
100101
; VARIANT4-NEXT: s_barrier_wait -1
101102
; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
@@ -142,6 +143,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
142143
; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
143144
; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0
144145
; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1]
146+
; VARIANT6-NEXT: s_wait_storecnt 0x0
145147
; VARIANT6-NEXT: s_barrier_signal -1
146148
; VARIANT6-NEXT: s_barrier_wait -1
147149
; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll

+22
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
1212
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
1313
; GCN-NEXT: s_wait_kmcnt 0x0
1414
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
15+
; GCN-NEXT: s_wait_storecnt 0x0
1516
; GCN-NEXT: s_barrier_signal -1
1617
; GCN-NEXT: s_barrier_wait -1
1718
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -28,6 +29,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
2829
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
2930
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
3031
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
32+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
3133
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
3234
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
3335
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -56,6 +58,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
5658
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
5759
; GCN-NEXT: s_wait_kmcnt 0x0
5860
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
61+
; GCN-NEXT: s_wait_storecnt 0x0
5962
; GCN-NEXT: s_barrier_signal 1
6063
; GCN-NEXT: s_barrier_wait 1
6164
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -72,6 +75,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
7275
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
7376
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
7477
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
78+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
7579
; GLOBAL-ISEL-NEXT: s_barrier_signal 1
7680
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
7781
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -100,6 +104,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
100104
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
101105
; GCN-NEXT: s_wait_kmcnt 0x0
102106
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
107+
; GCN-NEXT: s_wait_storecnt 0x0
103108
; GCN-NEXT: s_barrier_signal 0
104109
; GCN-NEXT: s_barrier_wait 0
105110
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -116,6 +121,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
116121
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
117122
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
118123
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
124+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
119125
; GLOBAL-ISEL-NEXT: s_barrier_signal 0
120126
; GLOBAL-ISEL-NEXT: s_barrier_wait 0
121127
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -146,6 +152,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
146152
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
147153
; GCN-NEXT: s_wait_kmcnt 0x0
148154
; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
155+
; GCN-NEXT: s_wait_storecnt 0x0
149156
; GCN-NEXT: s_barrier_signal m0
150157
; GCN-NEXT: s_barrier_wait 1
151158
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -163,6 +170,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
163170
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
164171
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
165172
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
173+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
166174
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
167175
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
168176
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -192,6 +200,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
192200
; GCN-NEXT: v_readfirstlane_b32 s0, v0
193201
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
194202
; GCN-NEXT: s_mov_b32 m0, s0
203+
; GCN-NEXT: s_wait_storecnt 0x0
195204
; GCN-NEXT: s_barrier_signal m0
196205
; GCN-NEXT: s_setpc_b64 s[30:31]
197206
;
@@ -203,6 +212,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
203212
; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0
204213
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
205214
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
215+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
206216
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
207217
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
208218
call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
@@ -216,6 +226,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
216226
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
217227
; GCN-NEXT: s_wait_kmcnt 0x0
218228
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
229+
; GCN-NEXT: s_wait_storecnt 0x0
219230
; GCN-NEXT: s_barrier_signal_isfirst -1
220231
; GCN-NEXT: s_cselect_b32 s3, s3, s5
221232
; GCN-NEXT: s_cselect_b32 s2, s2, s4
@@ -235,6 +246,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
235246
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
236247
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
237248
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
249+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
238250
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1
239251
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
240252
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -270,6 +282,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
270282
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
271283
; GCN-NEXT: s_wait_kmcnt 0x0
272284
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
285+
; GCN-NEXT: s_wait_storecnt 0x0
273286
; GCN-NEXT: s_barrier_signal_isfirst 1
274287
; GCN-NEXT: s_cselect_b32 s3, s3, s5
275288
; GCN-NEXT: s_cselect_b32 s2, s2, s4
@@ -289,6 +302,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
289302
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
290303
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
291304
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
305+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
292306
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
293307
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
294308
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -324,6 +338,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
324338
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
325339
; GCN-NEXT: s_wait_kmcnt 0x0
326340
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
341+
; GCN-NEXT: s_wait_storecnt 0x0
327342
; GCN-NEXT: s_barrier_signal_isfirst 1
328343
; GCN-NEXT: s_cselect_b32 s3, s3, s5
329344
; GCN-NEXT: s_cselect_b32 s2, s2, s4
@@ -343,6 +358,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p
343358
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
344359
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
345360
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
361+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
346362
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
347363
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
348364
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -379,6 +395,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
379395
; GCN-NEXT: s_mov_b32 m0, 1
380396
; GCN-NEXT: s_wait_kmcnt 0x0
381397
; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
398+
; GCN-NEXT: s_wait_storecnt 0x0
382399
; GCN-NEXT: s_barrier_signal_isfirst m0
383400
; GCN-NEXT: s_cselect_b32 s3, s3, s5
384401
; GCN-NEXT: s_cselect_b32 s2, s2, s4
@@ -399,6 +416,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %
399416
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
400417
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
401418
; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
419+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
402420
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
403421
; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
404422
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -444,6 +462,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
444462
; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9
445463
; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
446464
; GCN-NEXT: global_store_b32 v[7:8], v10, off
465+
; GCN-NEXT: s_wait_storecnt 0x0
447466
; GCN-NEXT: s_barrier_signal_isfirst m0
448467
; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0
449468
; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
@@ -470,6 +489,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
470489
; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
471490
; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0
472491
; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off
492+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
473493
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
474494
; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
475495
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1339,6 +1359,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
13391359
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
13401360
; GCN-NEXT: s_wait_kmcnt 0x0
13411361
; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
1362+
; GCN-NEXT: s_wait_storecnt 0x0
13421363
; GCN-NEXT: s_barrier_signal -1
13431364
; GCN-NEXT: s_barrier_wait -1
13441365
; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -1355,6 +1376,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
13551376
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
13561377
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
13571378
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
1379+
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
13581380
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
13591381
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
13601382
; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]

0 commit comments

Comments
 (0)