Skip to content

Commit

Permalink
[AMDGPU] Only emit SCOPE_SYS global_wb
Browse files Browse the repository at this point in the history
global_wb with scopes lower than SCOPE_SYS is unnecessary for correctness.

I was initially optimistic they would be very cheap no-ops but they can actually be quite expensive so let's avoid them.

Change-Id: I90dedc2dba6cc1be2d65896803f387903823270e
  • Loading branch information
Pierre-vh authored and jayfoad committed Oct 4, 2024
1 parent 55663b5 commit 5f65a9b
Show file tree
Hide file tree
Showing 38 changed files with 260 additions and 1,729 deletions.
334 changes: 126 additions & 208 deletions llvm/docs/AMDGPUUsage.rst

Large diffs are not rendered by default.

36 changes: 7 additions & 29 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2477,49 +2477,27 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;

// GLOBAL_WB is always needed, even for write-through caches, as it
// additionally ensures all operations have reached the desired cache level.
// global_wb is only necessary at system scope for gfx120x targets.
//
// Note that we can technically skip emission of SCOPE_SE writebacks for
// gfx120x as L1 is a buffer there (hence forwards all to L2), but we still
// emit them. The current strategy we use is to favor mirrorring SW semantics
// in the ISA whenever it is correct, and the performance cost is very low.
//
// This makes the memory model easier to understand, maintain, and also
// reduces the potential for bugs as it is sometimes difficult to anticipate
// all possible scenarios in which the WB will actually be needed.
bool SkipWB = false;
AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
switch (Scope) {
case SIAtomicScope::SYSTEM:
ScopeImm = AMDGPU::CPol::SCOPE_SYS;
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
.addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore we need to ensure all operations have reached L1,
// hence the SCOPE_SE WB.
// For CU mode, we need operations to reach L0, so the wait is enough -
// there are no ways for an operation to report completion without reaching
// at least L0.
if (ST.isCuModeEnabled())
SkipWB = true;
else
ScopeImm = AMDGPU::CPol::SCOPE_SE;
// No WB necessary, but we still have to wait.
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
// No cache to invalidate.
// No WB or wait necessary here.
return false;
default:
llvm_unreachable("Unsupported synchronization scope");
}

if (!SkipWB)
BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);

if (Pos == Position::AFTER)
--MI;

Expand Down
21 changes: 3 additions & 18 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -91,7 +90,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -164,7 +162,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -241,7 +238,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -318,7 +314,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -465,7 +460,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -617,7 +611,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -774,7 +767,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -915,7 +907,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
Expand Down Expand Up @@ -1058,7 +1049,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
Expand Down Expand Up @@ -1209,7 +1199,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
Expand Down Expand Up @@ -1364,7 +1353,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
Expand Down Expand Up @@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -1684,7 +1671,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1865,11 +1851,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
Expand Down Expand Up @@ -2058,11 +2044,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down
21 changes: 3 additions & 18 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -91,7 +90,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -164,7 +162,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -241,7 +238,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
Expand Down Expand Up @@ -318,7 +314,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -465,7 +460,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -617,7 +611,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -774,7 +767,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -915,7 +907,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
Expand Down Expand Up @@ -1058,7 +1049,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
Expand Down Expand Up @@ -1209,7 +1199,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
Expand Down Expand Up @@ -1364,7 +1353,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
Expand Down Expand Up @@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down Expand Up @@ -1684,7 +1671,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -1865,11 +1851,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
Expand Down Expand Up @@ -2058,11 +2044,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
Expand Down
Loading

0 comments on commit 5f65a9b

Please sign in to comment.