Skip to content

[AMDGPU] Move architected SGPR implementation into isel #79120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue(
Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
const ArgDescriptor *Arg;
const ArgDescriptor *Arg = nullptr;
const TargetRegisterClass *ArgRC;
LLT ArgTy;
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
const ArgDescriptor WorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP9);
// If GridZ is not programmed in an entry function then the hardware will set
// the GridZ bits (the high 16 bits of TTMP7) to all zeros, so the GridY value
// in the low-order 16 bits can be read without masking.
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
AMDGPU::TTMP7,
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
switch (ArgType) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Arg = &WorkGroupIDX;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
Arg = &WorkGroupIDY;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
Arg = &WorkGroupIDZ;
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
default:
break;
}
}

if (!Arg)
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

if (!Arg) {
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
Expand Down
68 changes: 49 additions & 19 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2063,11 +2063,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
const ArgDescriptor *Reg;
const ArgDescriptor *Reg = nullptr;
const TargetRegisterClass *RC;
LLT Ty;

std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
const ArgDescriptor WorkGroupIDX =
ArgDescriptor::createRegister(AMDGPU::TTMP9);
// If GridZ is not programmed in an entry function then the hardware will set
// the GridZ bits (the high 16 bits of TTMP7) to all zeros, so the GridY value
// in the low-order 16 bits can be read without masking.
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
AMDGPU::TTMP7,
AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
switch (PVID) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Reg = &WorkGroupIDX;
RC = &AMDGPU::SReg_32RegClass;
Ty = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
Reg = &WorkGroupIDY;
RC = &AMDGPU::SReg_32RegClass;
Ty = LLT::scalar(32);
break;
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
Reg = &WorkGroupIDZ;
RC = &AMDGPU::SReg_32RegClass;
Ty = LLT::scalar(32);
break;
default:
break;
}
}

if (!Reg)
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
if (!Reg) {
if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
// It's possible for a kernarg intrinsic call to appear in a kernel with
Expand Down Expand Up @@ -2496,28 +2530,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
}
}

if (Info.hasWorkGroupIDX()) {
Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
if (!HasArchitectedSGPRs)
if (!HasArchitectedSGPRs) {
if (Info.hasWorkGroupIDX()) {
Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}

CCInfo.AllocateReg(Reg);
}

if (Info.hasWorkGroupIDY()) {
Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
if (!HasArchitectedSGPRs)
if (Info.hasWorkGroupIDY()) {
Register Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}

CCInfo.AllocateReg(Reg);
}

if (Info.hasWorkGroupIDZ()) {
Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
if (!HasArchitectedSGPRs)
if (Info.hasWorkGroupIDZ()) {
Register Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

CCInfo.AllocateReg(Reg);
CCInfo.AllocateReg(Reg);
}
}

if (Info.hasWorkGroupInfo()) {
Expand Down
32 changes: 9 additions & 23 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -744,35 +744,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
}

// Add system SGPRs.
Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
Register Reg =
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
if (!HasArchitectedSGPRs)
NumSystemSGPRs += 1;

Register addWorkGroupIDX() {
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDX.getRegister();
}

Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
Register Reg =
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
if (!HasArchitectedSGPRs)
NumSystemSGPRs += 1;

Register addWorkGroupIDY() {
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDY.getRegister();
}

Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
Register Reg =
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
if (!HasArchitectedSGPRs)
NumSystemSGPRs += 1;

Register addWorkGroupIDZ() {
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDZ.getRegister();
}

Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
; GFX12-NEXT: v_mov_b32_e32 v31, v0
; GFX12-NEXT: s_mov_b32 s12, ttmp9
; GFX12-NEXT: s_mov_b64 s[8:9], 0
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
Expand Down
Loading