diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 0c588c8495862..4e1026b68f8ad 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1299,6 +1299,21 @@ The AMDGPU backend implements the following LLVM IR intrinsics. List AMDGPU intrinsics. +LLVM IR Metadata +------------------ + +The AMDGPU backend implements the following LLVM IR metadata. + +.. table:: AMDGPU LLVM IR Metadata + :name: amdgpu-llvm-ir-metadata-table + + ============================================== ========================================================== + LLVM IR Metadata Description + ============================================== ========================================================== + !amdgpu.last.use Sets TH_LOAD_LU temporal hint on load instructions that support it. + Takes priority over nontemporal hint (TH_LOAD_NT). + ============================================== ========================================================== + LLVM IR Attributes ------------------ diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 30a65bb332652..085ea4782686a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16177,9 +16177,12 @@ bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, MachineMemOperand::Flags SITargetLowering::getTargetMMOFlags(const Instruction &I) const { // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. + MachineMemOperand::Flags Flags = MachineMemOperand::MONone; if (I.getMetadata("amdgpu.noclobber")) - return MONoClobber; - return MachineMemOperand::MONone; + Flags |= MONoClobber; + if (I.getMetadata("amdgpu.last.use")) + Flags |= MOLastUse; + return Flags; } bool SITargetLowering::checkForPhysRegDependency( diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 73c23f0f987c3..62306fa667b36 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -98,22 +98,22 @@ class SIMemOpInfo final { bool IsCrossAddressSpaceOrdering = false; bool IsVolatile = false; bool IsNonTemporal = false; - - SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, - SIAtomicScope Scope = SIAtomicScope::SYSTEM, - SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, - SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, - bool IsCrossAddressSpaceOrdering = true, - AtomicOrdering FailureOrdering = - AtomicOrdering::SequentiallyConsistent, - bool IsVolatile = false, - bool IsNonTemporal = false) - : Ordering(Ordering), FailureOrdering(FailureOrdering), - Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), - InstrAddrSpace(InstrAddrSpace), - IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), - IsVolatile(IsVolatile), - IsNonTemporal(IsNonTemporal) { + bool IsLastUse = false; + + SIMemOpInfo( + AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, + SIAtomicScope Scope = SIAtomicScope::SYSTEM, + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, + bool IsCrossAddressSpaceOrdering = true, + AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, + bool IsVolatile = false, bool IsNonTemporal = false, + bool IsLastUse = false) + : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), + OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), + 
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), + IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal), + IsLastUse(IsLastUse) { if (Ordering == AtomicOrdering::NotAtomic) { assert(Scope == SIAtomicScope::NONE && @@ -201,6 +201,10 @@ class SIMemOpInfo final { return IsNonTemporal; } + /// \returns True if memory access of the machine instruction used to + /// create this SIMemOpInfo is last use, false otherwise. + bool isLastUse() const { return IsLastUse; } + /// \returns True if ordering constraint of the machine instruction used to /// create this SIMemOpInfo is unordered or higher, false otherwise. bool isAtomic() const { @@ -305,12 +309,13 @@ class SICacheControl { SIAtomicAddrSpace AddrSpace) const = 0; /// Update \p MI memory instruction of kind \p Op associated with address - /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return - /// true iff the instruction was modified. + /// spaces \p AddrSpace to indicate it is volatile and/or + /// nontemporal/last-use. Return true iff the instruction was modified. virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, - bool IsNonTemporal) const = 0; + bool IsNonTemporal, + bool IsLastUse = false) const = 0; virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { return false; @@ -394,8 +399,8 @@ class SIGfx6CacheControl : public SICacheControl { bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, - bool IsNonTemporal) const override; + bool IsVolatile, bool IsNonTemporal, + bool IsLastUse) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -447,8 +452,8 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl { bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, - bool IsNonTemporal) const override; + bool IsVolatile, bool IsNonTemporal, + bool IsLastUse) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -508,8 +513,8 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, - bool IsNonTemporal) const override; + bool IsVolatile, bool IsNonTemporal, + bool IsLastUse) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; @@ -552,8 +557,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl { bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, - bool IsNonTemporal) const override; + bool IsVolatile, bool IsNonTemporal, + bool IsLastUse) const override; bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -578,8 +583,8 @@ class SIGfx11CacheControl : public SIGfx10CacheControl { bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, - bool IsNonTemporal) const override; + bool IsVolatile, bool IsNonTemporal, + bool IsLastUse) const override; }; class SIGfx12CacheControl : public SIGfx11CacheControl { @@ -614,8 +619,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - 
bool IsVolatile, - bool IsNonTemporal) const override; + bool IsVolatile, bool IsNonTemporal, + bool IsLastUse) const override; bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; }; @@ -745,12 +750,14 @@ std::optional SIMemOpAccess::constructFromMIWithMMO( SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; bool IsNonTemporal = true; bool IsVolatile = false; + bool IsLastUse = false; // Validator should check whether or not MMOs cover the entire set of // locations accessed by the memory instruction. for (const auto &MMO : MI->memoperands()) { IsNonTemporal &= MMO->isNonTemporal(); IsVolatile |= MMO->isVolatile(); + IsLastUse |= MMO->getFlags() & MOLastUse; InstrAddrSpace |= toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); AtomicOrdering OpOrdering = MMO->getSuccessOrdering(); @@ -792,7 +799,7 @@ std::optional SIMemOpAccess::constructFromMIWithMMO( } return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, - IsNonTemporal); + IsNonTemporal, IsLastUse); } std::optional @@ -969,7 +976,7 @@ bool SIGfx6CacheControl::enableRMWCacheBypass( bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal) const { + bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. @@ -1322,7 +1329,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass( bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal) const { + bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. @@ -1624,7 +1631,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass( bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal) const { + bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. @@ -1856,7 +1863,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal) const { + bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not @@ -2127,7 +2134,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass( bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal) const { + bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write insructions. 
The // latter use glc to indicate if the atomic returns a result and so must not @@ -2379,7 +2386,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal) const { + bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write instructions. assert(MI->mayLoad() ^ MI->mayStore()); @@ -2392,7 +2399,10 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; - if (IsNonTemporal) { + if (IsLastUse) { + // Set last-use hint. + Changed |= setTH(MI, AMDGPU::CPol::TH_LU); + } else if (IsNonTemporal) { // Set non-temporal hint for all cache levels. Changed |= setTH(MI, AMDGPU::CPol::TH_NT); } @@ -2472,11 +2482,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, } // Atomic instructions already bypass caches to the scope specified by the - // SyncScope operand. Only non-atomic volatile and nontemporal instructions - // need additional treatment. - Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(), - SIMemOp::LOAD, MOI.isVolatile(), - MOI.isNonTemporal()); + // SyncScope operand. Only non-atomic volatile and nontemporal/last-use + // instructions need additional treatment. + Changed |= CC->enableVolatileAndOrNonTemporal( + MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(), + MOI.isNonTemporal(), MOI.isLastUse()); + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll new file mode 100644 index 0000000000000..71ec5512c72de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s + +define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { +; GFX12-LABEL: flat_last_use_load_0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr %in, align 4, !amdgpu.last.use !{} + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { +; GFX12-LABEL: flat_last_use_load_1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid + %val 
= load i32, ptr %val.gep, align 4, !amdgpu.last.use !{} + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { +; GFX12-LABEL: flat_last_use_and_volatile_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm +entry: + %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) { +; GFX12-LABEL: flat_last_use_and_nontemporal_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 + store i32 %val, ptr %out + ret void +} + +!0 = !{i32 1} +declare i32 @llvm.amdgcn.workitem.id.x() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-CU: {{.*}} +; GFX12-WGP: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll new file mode 100644 index 0000000000000..fa2fc3c423694 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s + +define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: global_last_use_load_0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: global_last_use_load_1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] th:TH_LOAD_LU +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, ptr 
addrspace(1) %in, i32 %tid + %val = load i32, ptr addrspace(1) %val.gep, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: global_last_use_and_volatile_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: global_last_use_and_nontemporal_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] th:TH_LOAD_LU +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val = load i32, ptr addrspace(1) %val.gep, align 4, !amdgpu.last.use !{}, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} +!0 = !{i32 1} +declare i32 @llvm.amdgcn.workitem.id.x() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX12-CU: {{.*}} +; GFX12-WGP: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll new file mode 100644 index 0000000000000..1ce9fc308af31 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s + +define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: private_last_use_load_0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_LU +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: private_last_use_load_1: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, v0, s2 th:TH_LOAD_LU +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid + %val = load i32, ptr addrspace(5) %val.gep, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: private_last_use_and_volatile_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %val = load volatile i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{} + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5) %in, ptr addrspace(1) %out) { +; GFX12-LABEL: private_last_use_and_nontemporal_load: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_LU +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 + store i32 %val, ptr addrspace(1) %out + ret void +} + +!0 = !{i32 1} +declare i32 @llvm.amdgcn.workitem.id.x() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-CU: {{.*}} +; GFX12-WGP: {{.*}}
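
A minimal sketch, not part of the diff above, of how a producer (a front end or an earlier IR pass) could attach the !amdgpu.last.use metadata that these changes consume. The markLastUse helper name is hypothetical; the calls are the stock LLVM metadata API (Instruction::setMetadata, MDNode::get), and the empty node matches the `!amdgpu.last.use !{}` form used in the tests.

    // Tag a load as a "last use" so that SITargetLowering::getTargetMMOFlags()
    // propagates MOLastUse to its MachineMemOperand and SIMemoryLegalizer
    // selects the TH_LOAD_LU temporal hint on gfx12.
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Metadata.h"

    using namespace llvm;

    // Hypothetical helper: LI is a load the caller has already proven to be
    // the last use of its memory location.
    static void markLastUse(LoadInst &LI) {
      LI.setMetadata("amdgpu.last.use", MDNode::get(LI.getContext(), {}));
    }

An empty MDNode is sufficient because the backend only checks for the presence of the metadata kind (I.getMetadata("amdgpu.last.use") in getTargetMMOFlags), not its operands.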