Skip to content

Commit

Permalink
[AMDGPU] Update base addr of dyn alloca considering GrowingUp stack (#…
Browse files Browse the repository at this point in the history
…119822)

Currently, the compiler calculates the base address of a
dynamically sized stack object (alloca) as follows:
1. `NewSP = Align(CurrSP + Size)`
_where_ `Size = # of elements * wave size * size of the alloca element type`
2. `BaseAddr = NewSP`
3. The alignment is computed as: `AlignedAddr = Addr & ~(Alignment - 1)`
4. Return the `BaseAddr`
This makes sense when the stack grows downwards.

The AMDGPU stack grows upwards, so the base address
needs to be aligned first and the SP bumped by the required size afterwards:
1. `BaseAddr = Align(CurrSP)`
2. `NewSP = BaseAddr + Size`
3. `AlignedAddr = (Addr + (Alignment - 1)) & ~(Alignment - 1)`
4. Return the `BaseAddr`.
  • Loading branch information
easyonaadit authored Dec 20, 2024
1 parent a73ca29 commit c760671
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 215 deletions.
13 changes: 8 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1204,15 +1204,18 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

auto SPCopy = B.buildCopy(PtrTy, SPReg);
auto OldSP = B.buildCopy(PtrTy, SPReg);
if (Alignment > TFI.getStackAlign()) {
auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
B.buildMaskLowPtrBits(Dst, PtrAdd,
auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
B.buildConstant(LLT::scalar(32), StackAlignMask));
B.buildMaskLowPtrBits(Dst, Tmp1,
Log2(Alignment) + ST.getWavefrontSizeLog2());
} else {
B.buildPtrAdd(Dst, SPCopy, ScaledSize);
B.buildCopy(Dst, OldSP);
}

auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
B.buildCopy(SPReg, PtrAdd);
MI.eraseFromParent();
return true;
}
Expand Down
37 changes: 21 additions & 16 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4016,8 +4016,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InVals, /*IsThisReturn=*/false, SDValue());
}

// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for applying the wave size scale to the increment amount.
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for stack growth direction (default: downwards, AMDGPU: upwards) and
// applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
Expand All @@ -4037,31 +4038,35 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();

const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
"Stack grows upwards for AMDGPU");

Chain = BaseAddr.getValue(1);
Align StackAlign = TFL->getStackAlign();
if (Alignment > StackAlign) {
uint64_t ScaledAlignment = (uint64_t)Alignment.value()
<< Subtarget->getWavefrontSizeLog2();
uint64_t StackAlignMask = ScaledAlignment - 1;
SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
DAG.getConstant(StackAlignMask, dl, VT));
BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
DAG.getSignedConstant(-ScaledAlignment, dl, VT));
}

SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

Align StackAlign = TFL->getStackAlign();
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SP, ScaledSize); // Value
if (Alignment && *Alignment > StackAlign) {
Tmp1 = DAG.getNode(
ISD::AND, dl, VT, Tmp1,
DAG.getSignedConstant(-(uint64_t)Alignment->value()
<< Subtarget->getWavefrontSizeLog2(),
dl, VT));
}
SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value

Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

return DAG.getMergeValues({Tmp1, Tmp2}, dl);
return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
}

SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Expand Down
Loading

0 comments on commit c760671

Please sign in to comment.