[AArch64] Stack probing for dynamic allocas in SelectionDAG #66525

Merged
28 changes: 26 additions & 2 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -479,6 +479,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
// The stack probing code for the dynamically allocated outgoing arguments
// area assumes that the stack is probed at the top - either by the prologue
// code, which issues a probe if `hasVarSizedObjects` returns true, or by the
// most recent variable-sized object allocation. Changing the condition here
// may need to be followed up by changes to the probe issuing logic.
return !MF.getFrameInfo().hasVarSizedObjects();
}

@@ -487,6 +492,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
const AArch64TargetLowering *TLI =
MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
@@ -513,8 +521,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(Amount), TII);

if (TLI->hasInlineStackProbe(MF) &&
-Amount >= AArch64::StackProbeMaxUnprobedStack) {
// When stack probing is enabled, the decrement of SP may need to be
// probed. We only need to do this if the call site needs 1024 bytes of
// space or more, because a region smaller than that is allowed to be
// unprobed at an ABI boundary. We rely on the fact that SP has been
// probed exactly at this point, either by the prologue or most recent
// dynamic allocation.
assert(MFI.hasVarSizedObjects() &&
"non-reserved call frame without var sized objects?");
Register ScratchReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
} else {
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(Amount), TII);
}
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
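
Note: below is a minimal LLVM IR sketch (not taken from this patch) of the case the hunk above guards. The variable-sized alloca makes hasReservedCallFrame() return false, and the byval argument is assumed to need more than StackProbeMaxUnprobedStack (1024) bytes of outgoing stack, so the SP decrement for the call frame is itself probed. The function and type names are illustrative.

target triple = "aarch64-linux"

%big = type { [2048 x i8] }

declare void @take_big(ptr byval(%big) align 8)

define void @dynamic_call_frame(i64 %n) "probe-stack"="inline-asm" {
  %v = alloca i8, i64 %n, align 1      ; variable-sized object
  %tmp = alloca %big, align 8
  ; The 2048-byte byval copy lands in the outgoing argument area, so the
  ; call-frame setup should take the probed path above.
  call void @take_big(ptr byval(%big) align 8 %tmp)
  ret void
}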
154 changes: 102 additions & 52 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -569,10 +569,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSHL, MVT::i32, Custom);
setOperationAction(ISD::FSHL, MVT::i64, Custom);

if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2353,6 +2350,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::CSINC)
MAKE_CASE(AArch64ISD::THREAD_POINTER)
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
MAKE_CASE(AArch64ISD::ABDS_PRED)
MAKE_CASE(AArch64ISD::ABDU_PRED)
MAKE_CASE(AArch64ISD::HADDS_PRED)
@@ -2719,6 +2717,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineFunction &MF = *MBB->getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
DebugLoc DL = MBB->findDebugLoc(MBBI);
const AArch64InstrInfo &TII =
*MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
Register TargetReg = MI.getOperand(0).getReg();
MachineBasicBlock::iterator NextInst =
TII.probedStackAlloc(MBBI, TargetReg, false);

MI.eraseFromParent();
return NextInst->getParent();
}

MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
@@ -2863,6 +2877,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(

case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);

case AArch64::PROBED_STACKALLOC_DYN:
return EmitDynamicProbedAlloc(MI, BB);

case AArch64::LD1_MXIPXX_H_PSEUDO_B:
return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -14052,9 +14070,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
AN->getMemOperand());
}

SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
SDValue
AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {

SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);

if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
PtrVT, 0);
@@ -14078,7 +14121,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(

Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
return Chain;

SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);

Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}
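
Note: an illustrative LLVM IR sketch (not taken from this patch) of the two Windows cases handled above. By default the allocation size is handed to the __chkstk-style helper in 16-byte units before SP is adjusted; with the "no-stack-arg-probe" attribute the helper call is skipped and SP is simply decremented (and re-aligned for an over-aligned alloca). The function names are made up.

target triple = "aarch64-pc-windows-msvc"

; Default: lowered via the call sequence around the stack-probing helper.
define ptr @probed_win(i64 %n) {
  %p = alloca i8, i64 %n, align 1
  ret ptr %p
}

; "no-stack-arg-probe": plain SUB on SP, plus an AND for the alignment.
define ptr @unprobed_win(i64 %n) "no-stack-arg-probe" {
  %p = alloca i8, i64 %n, align 32
  ret ptr %p
}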

SDValue
AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);

MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
SDLoc dl(Op);
EVT VT = Node->getValueType(0);

// Construct the new SP value in a GPR.
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));

// Set the real SP to the new value with a probing loop.
Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}
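
Note: a short LLVM IR sketch (not taken from this patch) of input that takes this inline path on a non-Windows target. The lowering computes the new SP in a scratch GPR as (SP - size) aligned down to the requested alignment, and AArch64ISD::PROBED_ALLOCA then moves the real SP to that value with a probing loop. The function name is illustrative.

target triple = "aarch64-linux"

define ptr @probed_overaligned(i64 %n) "probe-stack"="inline-asm" {
  ; Selected into PROBED_STACKALLOC_DYN, which walks SP down to the computed
  ; target one page at a time, probing as it goes.
  %p = alloca i8, i64 %n, align 64
  ret ptr %p
}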

SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();

if (Subtarget->isTargetWindows())
return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
else if (hasInlineStackProbe(MF))
return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
else
return SDValue();
}

// When x and y are extended, lower:
@@ -14132,51 +14227,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
}

SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() &&
"Only Windows alloca probing supported");
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Align =
cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);

if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);

SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);

Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
}

SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
13 changes: 10 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,10 @@ enum NodeType : unsigned {
ADC,
SBC, // adc, sbc instructions

// To avoid stack clash, allocation is performed by block and each block is
// probed.
PROBED_ALLOCA,

// Predicated instructions where inactive lanes produce undefined results.
ABDS_PRED,
ABDU_PRED,
@@ -616,6 +620,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;

MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
MachineBasicBlock *MBB) const;

MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
MachineBasicBlock *BB) const;
@@ -1141,10 +1148,10 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,
SelectionDAG &DAG) const;

SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;

SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
14 changes: 14 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -865,6 +865,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;

def AArch64probedalloca
: SDNode<"AArch64ISD::PROBED_ALLOCA",
SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPMayStore]>;

def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
[SDNPHasChain, SDNPOutGlue]>;
@@ -965,6 +971,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
[]>,
Sched<[]>;

// Probed stack allocations of a variable size, used for allocas of unknown size
// when stack-clash protection is enabled.
let usesCustomInserter = 1 in
def PROBED_STACKALLOC_DYN : Pseudo<(outs),
(ins GPR64common:$target),
[(AArch64probedalloca GPR64common:$target)]>,
Sched<[]>;

} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1

14 changes: 14 additions & 0 deletions llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
@@ -0,0 +1,14 @@
; RUN: llc --stop-after=finalize-isel -o - < %s | FileCheck %s
target triple = "aarch64-linux"

; Check dynamic stack allocation and probing instructions do not have
; the FrameSetup flag.

; CHECK-NOT: frame-setup
define void @no_frame_setup(i64 %size, ptr %out) #0 {
%v = alloca i8, i64 %size, align 1
store ptr %v, ptr %out, align 8
ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
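
Note: a hypothetical companion check (not part of this patch) for the positive side of the same pipeline stage: in a probed function the dynamic allocation should have been selected into the new PROBED_STACKALLOC_DYN pseudo by finalize-isel.

; RUN: llc --stop-after=finalize-isel -o - < %s | FileCheck %s
target triple = "aarch64-linux"

; CHECK-LABEL: name: dynamic
; CHECK: PROBED_STACKALLOC_DYN
define void @dynamic(i64 %size, ptr %out) #0 {
  %v = alloca i8, i64 %size, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

attributes #0 = { "probe-stack"="inline-asm" }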