From 3e3aa57549e5df3a7b3610d1ebfb97ca5ddd0e18 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 31 Oct 2025 14:33:31 -0700 Subject: [PATCH 1/2] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_FENCE --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 + llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll | 388 ++++++++++++++++++ .../memory-legalizer-atomic-fence.ll | 14 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll | 2 +- 4 files changed, 398 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 103cdec8233a0..fb768c9fd46a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll new file mode 100644 index 0000000000000..6cdcfe354c69b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll @@ -0,0 +1,388 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s + +@global_var = addrspace(1) global i32 0, align 4 + +define amdgpu_kernel void @fence_release(ptr addrspace(1) %ptr, i32 %val1, i32 %val2) { +; GFX9-LABEL: fence_release: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fence_release: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fence_release: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v2, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + store i32 %val1, ptr addrspace(1) %ptr + fence release + store volatile i32 %val2, ptr addrspace(1) @global_var + ret void +} + +define amdgpu_kernel void @fence_acquire(ptr addrspace(1) %ptr) { +; GFX9-LABEL: fence_acquire: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fence_acquire: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fence_acquire: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + fence acquire + %val = load i32, ptr addrspace(1) %ptr + store volatile i32 %val, ptr addrspace(1) @global_var + ret void +} + +define amdgpu_kernel void @fence_acq_rel(ptr addrspace(1) %ptr, i32 %val) { +; GFX9-LABEL: fence_acq_rel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fence_acq_rel: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fence_acq_rel: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + store i32 %val, ptr addrspace(1) %ptr + fence acq_rel + %load = load i32, ptr addrspace(1) %ptr + store volatile i32 %load, ptr addrspace(1) @global_var + ret void +} + +define amdgpu_kernel void @fence_seq_cst(ptr addrspace(1) %ptr, i32 %val) { +; GFX9-LABEL: fence_seq_cst: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fence_seq_cst: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fence_seq_cst: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + store i32 %val, ptr addrspace(1) %ptr + fence seq_cst + %load = load i32, ptr addrspace(1) %ptr + store volatile i32 %load, ptr addrspace(1) @global_var + ret void +} + +define amdgpu_kernel void @fence_workgroup_release(ptr addrspace(3) %ptr, i32 %val1, i32 %val2) { +; GFX9-LABEL: fence_workgroup_release: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fence_workgroup_release: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-NEXT: ds_store_b32 v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v2, v3, s[4:5] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fence_workgroup_release: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_store_b32 v2, v3, s[4:5] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + store i32 %val1, ptr addrspace(3) %ptr + fence syncscope("workgroup") release + store volatile i32 %val2, ptr addrspace(1) @global_var + ret void +} + +define amdgpu_kernel void @fence_agent_seq_cst(ptr addrspace(1) %ptr, i32 %val) { +; GFX9-LABEL: fence_agent_seq_cst: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fence_agent_seq_cst: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fence_agent_seq_cst: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + store i32 %val, ptr addrspace(1) %ptr + fence syncscope("agent") seq_cst + %load = load i32, ptr addrspace(1) %ptr + store volatile i32 %load, ptr addrspace(1) @global_var + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9f6fe81..61a61376d7ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 From 546df5bea07b301b0368b19a2948178362085e59 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Fri, 31 Oct 2025 20:15:25 -0700 Subject: [PATCH 2/2] Remove redundant test --- llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll | 388 ------------------- 1 file changed, 388 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll deleted file mode 100644 index 6cdcfe354c69b..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll +++ /dev/null @@ -1,388 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s - -@global_var = addrspace(1) global i32 0, align 4 - -define amdgpu_kernel void @fence_release(ptr addrspace(1) %ptr, i32 %val1, i32 %val2) { -; GFX9-LABEL: fence_release: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: fence_release: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: s_getpc_b64 s[4:5] -; GFX11-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fence_release: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: s_getpc_b64 s[4:5] -; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v2, s[4:5] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_endpgm - store i32 %val1, ptr addrspace(1) %ptr - fence release - store volatile i32 %val2, ptr addrspace(1) @global_var - ret void -} - -define amdgpu_kernel void @fence_acquire(ptr addrspace(1) %ptr) { -; GFX9-LABEL: fence_acquire: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: fence_acquire: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fence_acquire: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_endpgm - fence acquire - %val = load i32, ptr addrspace(1) %ptr - store volatile i32 %val, ptr addrspace(1) @global_var - ret void -} - -define amdgpu_kernel void @fence_acq_rel(ptr addrspace(1) %ptr, i32 %val) { -; GFX9-LABEL: fence_acq_rel: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: fence_acq_rel: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fence_acq_rel: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_endpgm - store i32 %val, ptr addrspace(1) %ptr - fence acq_rel - %load = load i32, ptr addrspace(1) %ptr - store volatile i32 %load, ptr addrspace(1) @global_var - ret void -} - -define amdgpu_kernel void @fence_seq_cst(ptr addrspace(1) %ptr, i32 %val) { -; GFX9-LABEL: fence_seq_cst: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: fence_seq_cst: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fence_seq_cst: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_endpgm - store i32 %val, ptr addrspace(1) %ptr - fence seq_cst - %load = load i32, ptr addrspace(1) %ptr - store volatile i32 %load, ptr addrspace(1) @global_var - ret void -} - -define amdgpu_kernel void @fence_workgroup_release(ptr addrspace(3) %ptr, i32 %val1, i32 %val2) { -; GFX9-LABEL: fence_workgroup_release: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: ds_write_b32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: fence_workgroup_release: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: s_getpc_b64 s[4:5] -; GFX11-NEXT: s_add_u32 s4, s4, global_var@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_mov_b32_e32 v3, s2 -; GFX11-NEXT: ds_store_b32 v1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b32 v2, v3, s[4:5] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fence_workgroup_release: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_getpc_b64 s[4:5] -; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 -; GFX12-NEXT: v_mov_b32_e32 v3, s2 -; GFX12-NEXT: ds_store_b32 v1, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_store_b32 v2, v3, s[4:5] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_endpgm - store i32 %val1, ptr addrspace(3) %ptr - fence syncscope("workgroup") release - store volatile i32 %val2, ptr addrspace(1) @global_var - ret void -} - -define amdgpu_kernel void @fence_agent_seq_cst(ptr addrspace(1) %ptr, i32 %val) { -; GFX9-LABEL: fence_agent_seq_cst: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: fence_agent_seq_cst: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, global_var@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fence_agent_seq_cst: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_endpgm - store i32 %val, ptr addrspace(1) %ptr - fence syncscope("agent") seq_cst - %load = load i32, ptr addrspace(1) %ptr - store volatile i32 %load, ptr addrspace(1) @global_var - ret void -}