diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 51bd9b63c127e..4eaa720aced5f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2491,6 +2491,10 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn; def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn; +// i32 @llvm.amdgcn.wave.id() +def int_amdgcn_wave_id : + DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 060fb66d38f7b..ed2eb05d94a1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1471,6 +1471,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureWavefrontSize32, FeatureShaderCyclesRegister, FeatureArchitectedFlatScratch, + FeatureArchitectedSGPRs, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureFlatAtomicFaddF32Inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index de25f9241a503..54562cb79a9a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -75,15 +75,14 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { << " WorkGroupIDY: " << FI.second.WorkGroupIDY << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ << " WorkGroupInfo: " << FI.second.WorkGroupInfo + << " WaveID: " << FI.second.WaveID << " LDSKernelId: " << FI.second.LDSKernelId << " PrivateSegmentWaveByteOffset: " - << FI.second.PrivateSegmentWaveByteOffset + << FI.second.PrivateSegmentWaveByteOffset << " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr - << " ImplicitArgPtr: " << FI.second.ImplicitArgPtr - << " WorkItemIDX " << FI.second.WorkItemIDX - << " WorkItemIDY " << FI.second.WorkItemIDY - << " WorkItemIDZ " << FI.second.WorkItemIDZ - << '\n'; + << " ImplicitArgPtr: " << FI.second.ImplicitArgPtr << " WorkItemIDX " + << FI.second.WorkItemIDX << " WorkItemIDY " << FI.second.WorkItemIDY + << " WorkItemIDZ " << FI.second.WorkItemIDZ << '\n'; } } @@ -108,6 +107,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue( case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); + case AMDGPUFunctionArgInfo::WAVE_ID: + return std::tuple(WaveID ? &WaveID : nullptr, &AMDGPU::SGPR_32RegClass, + LLT::scalar(32)); case AMDGPUFunctionArgInfo::LDS_KERNEL_ID: return std::tuple(LDSKernelId ? &LDSKernelId : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 42b33c50d9f8c..d3e8394c4a7d4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -111,6 +111,7 @@ struct AMDGPUFunctionArgInfo { WORKGROUP_ID_X = 10, WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, + WAVE_ID = 13, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, IMPLICIT_BUFFER_PTR = 15, IMPLICIT_ARG_PTR = 16, @@ -141,6 +142,7 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor WorkGroupIDY; ArgDescriptor WorkGroupIDZ; ArgDescriptor WorkGroupInfo; + ArgDescriptor WaveID; ArgDescriptor PrivateSegmentWaveByteOffset; // Pointer with offset from kernargsegmentptr to where special ABI arguments diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee288894518..ff029d6dc09a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6922,6 +6922,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::WAVE_ID); case Intrinsic::amdgcn_lds_kernel_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::LDS_KERNEL_ID); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4f4bc45e49b43..213e35704f740 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2431,6 +2431,9 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, CCInfo.AllocateReg(Reg); } + if (HasArchitectedSGPRs && Info.hasWaveID()) + CCInfo.AllocateReg(Info.addWaveID()); + if (Info.hasWorkGroupInfo()) { Register Reg = Info.addWorkGroupInfo(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); @@ -2635,7 +2638,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert(!UserSGPRInfo.hasFlatScratchInit()); if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs()) assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ()); + !Info->hasWorkGroupIDZ() && !Info->hasWaveID()); } if (CallConv == CallingConv::AMDGPU_PS) { @@ -7775,6 +7778,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_workgroup_id_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_wave_id: + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WAVE_ID); case Intrinsic::amdgcn_lds_kernel_id: { if (MFI->isEntryFunction()) return getLDSKernelId(DAG, DL); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index e8142244b7db6..84e33d2cd5959 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -39,10 +39,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI) : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)), UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false), - WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false), - PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), - WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), - GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { + WorkGroupIDZ(false), WorkGroupInfo(false), WaveID(false), + LDSKernelId(false), PrivateSegmentWaveByteOffset(false), + WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), + ImplicitArgPtr(false), GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { const GCNSubtarget &ST = *static_cast(STI); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); @@ -107,8 +107,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, MayNeedAGPRs = false; // We will select all MAI with VGPR operands. } - if (!AMDGPU::isGraphics(CC) || - (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) { + bool HasArchitectedSGPRs = + CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs(); + if (!AMDGPU::isGraphics(CC) || HasArchitectedSGPRs) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; @@ -135,6 +136,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, LDSKernelId = true; } + // For GFX12+. + if (HasArchitectedSGPRs) + WaveID = true; + if (isEntryFunction()) { // X, XY, and XYZ are the only supported combinations, so make sure Y is // enabled if Z is. diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index dc63ae44c528d..1c93514e06cec 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -447,6 +447,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool WaveID : 1; bool LDSKernelId : 1; bool PrivateSegmentWaveByteOffset : 1; @@ -782,6 +783,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return ArgInfo.WorkGroupInfo.getRegister(); } + // Supported for GFX12+. + Register addWaveID() { + ArgInfo.WaveID = ArgDescriptor::createRegister(AMDGPU::TTMP8, 0x1f << 25); + return ArgInfo.WaveID.getRegister(); + } + + bool hasWaveID() const { return WaveID; } + bool hasLDSKernelId() const { return LDSKernelId; } // Add special VGPR inputs diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index c732ff7094255..a9fe5cae38a1e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s define amdgpu_cs void @_amdgpu_cs_main() { ; GFX9-SDAG-LABEL: _amdgpu_cs_main: @@ -23,6 +25,30 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm .entry: %idx = call i32 @llvm.amdgcn.workgroup.id.x() %idy = call i32 @llvm.amdgcn.workgroup.id.y() @@ -68,6 +94,24 @@ define amdgpu_cs void @caller() { ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call amdgpu_gfx void @callee(i32 %idx) ret void diff --git a/llvm/test/CodeGen/AMDGPU/wave-id-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/wave-id-intrinsic.ll new file mode 100644 index 0000000000000..35367bf52789c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wave-id-intrinsic.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) { +; GFX9-LABEL: test_wave_id: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: test_wave_id: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %waveid = call i32 @llvm.amdgcn.wave.id() + store i32 %waveid, ptr addrspace(1) %out + ret void +} + +declare i32 @llvm.amdgcn.wave.id() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index c492b54759d82..d4e9dcab3f03f 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -1,25 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GCN-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel --verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GCN-SDAG-LABEL: workgroup_id_x: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-SDAG-NEXT: s_endpgm +; GFX9-SDAG-LABEL: workgroup_id_x: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: workgroup_id_x: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GCN-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GCN-GISEL-NEXT: s_endpgm +; GFX9-GISEL-LABEL: workgroup_id_x: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_id_x: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_id_x: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -27,27 +49,29 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { } define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) { -; GCN-SDAG-LABEL: workgroup_id_xy: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp7 -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GCN-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xy: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: workgroup_id_xy: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, ttmp7 -; GCN-GISEL-NEXT: global_store_dword v0, v1, s[2:3] -; GCN-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_xy: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx %idy = call i32 @llvm.amdgcn.workgroup.id.y() @@ -57,37 +81,56 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GCN-SDAG-LABEL: workgroup_id_xyz: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GCN-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GCN-SDAG-NEXT: s_endpgm +; GFX9-SDAG-LABEL: workgroup_id_xyz: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: workgroup_id_xyz: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: workgroup_id_xyz: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GCN-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GCN-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GCN-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GCN-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_xyz: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx %idy = call i32 @llvm.amdgcn.workgroup.id.y()