This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Stack probe support for x86 #21

Closed
wants to merge 3 commits
3 changes: 3 additions & 0 deletions include/llvm/CodeGen/MachineFunction.h
@@ -258,6 +258,9 @@ class MachineFunction {
/// Should we be emitting segmented stack stuff for the function
bool shouldSplitStack();

/// Should we be probing the stack for the function
bool shouldProbeStack();

/// getNumBlockIDs - Return the number of MBB ID's allocated.
///
unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); }
5 changes: 5 additions & 0 deletions lib/CodeGen/MachineFunction.cpp
@@ -130,6 +130,11 @@ bool MachineFunction::shouldSplitStack() {
return getFunction()->hasFnAttribute("split-stack");
}

/// Should we be probing the stack for the function
bool MachineFunction::shouldProbeStack() {
return getFunction()->hasFnAttribute("probe-stack");
}

/// RenumberBlocks - This discards all of the MachineBasicBlock numbers and
/// recomputes them. This guarantees that the MBB numbers are sequential,
/// dense, and match the ordering of the blocks within the function. If a
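
For reference, a minimal IR sketch of what the new hook keys off (the function name and frame size here are made up for illustration, not taken from the patch): MachineFunction::shouldProbeStack() returns true for any function carrying the string attribute "probe-stack".

define void @probed() "probe-stack" {
entry:
  %buf = alloca [8192 x i8], align 16
  ret void
}
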
139 changes: 84 additions & 55 deletions lib/Target/X86/X86FrameLowering.cpp
@@ -482,6 +482,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);

bool UseRedZone = false;
bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMacho()) ||
MF.shouldProbeStack();

// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
// function, and use up to 128 bytes of stack space, don't have a frame
// pointer, calls, or dynamic alloca then we do not need to adjust the
@@ -493,12 +497,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->adjustsStack() && // No calls.
!IsWin64 && // Win64 has no Red Zone
!(UseStackProbe && StackSize > 128) && // No stack probes
!usesTheStack(MF) && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
MFI->setStackSize(StackSize);
UseRedZone = true;
}

// Insert stack pointer adjustment for later moving of return addr. Only
@@ -663,71 +669,94 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// responsible for adjusting the stack pointer. Touching the stack at 4K
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) {
const char *StackProbeSymbol;
if (NumBytes >= 4096 && UseStackProbe) {
assert(!UseRedZone && "The Red Zone is not accounted for in stack probes");

if (Is64Bit) {
if (STI.isTargetCygMing()) {
StackProbeSymbol = "___chkstk_ms";
if (NumBytes <= 0x5000) {
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
StackPtr)
.addReg(StackPtr)
.addImm(NumBytes)
.setMIFlag(MachineInstr::FrameSetup);

for (uint64_t i = 0; i < NumBytes / 0x1000; ++i) {
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::OR64mi8 : X86::OR32mi8))
.addReg(StackPtr)
.addImm(1)
.addReg(0)
.addImm(NumBytes - (i + 1) * 0x1000)
.addReg(0)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
}
} else {
const char *StackProbeSymbol;

if (STI.isOSWindows()) {
if (Is64Bit) {
if (STI.isTargetCygMing()) {
StackProbeSymbol = "___chkstk_ms";
} else {
StackProbeSymbol = "__chkstk";
}
} else if (STI.isTargetCygMing())
StackProbeSymbol = "_alloca";
else
StackProbeSymbol = "_chkstk";
} else {
StackProbeSymbol = "__chkstk";
StackProbeSymbol = "__probestack";
}
} else if (STI.isTargetCygMing())
StackProbeSymbol = "_alloca";
else
StackProbeSymbol = "_chkstk";

// Check whether EAX is livein for this function.
bool isEAXAlive = isEAXLiveIn(MF);
// Check whether the accumulator register is livein for this function.
bool isRegAccAlive = isEAXLiveIn(MF);
auto RegAcc = Is64Bit ? X86::RAX : X86::EAX;

if (isEAXAlive) {
// Sanity check that EAX is not livein for this function.
// It should not be, so throw an assert.
assert(!Is64Bit && "EAX is livein in x64 case!");
if (isRegAccAlive) {
// Save RegAcc
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
.addReg(RegAcc, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
}

// Save EAX
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
.addReg(X86::EAX, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
}
uint64_t NumBytesAdj = isRegAccAlive ? NumBytes - (Is64Bit ? 8 : 4) :
NumBytes;

if (Is64Bit) {
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
.addImm(NumBytes)
// If RegAcc was live-in it has just been pushed, so only NumBytesAdj bytes
// are requested here; the 8/4 bytes already pushed for RegAcc cover the rest.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64ri : X86::MOV32ri),
RegAcc)
.addImm(NumBytesAdj)
.setMIFlag(MachineInstr::FrameSetup);
} else {
// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
// We'll also use 4 already allocated bytes for EAX.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
.addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
.setMIFlag(MachineInstr::FrameSetup);
}

BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
.addReg(StackPtr, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);

if (Is64Bit) {
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
// themself. It also does not clobber %rax so we can reuse it when
// adjusting %rsp.
BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
.addReg(StackPtr)
.addReg(X86::RAX)
auto CallOp = Is64Bit ? (STI.isOSWindows() ? X86::W64ALLOCA :
X86::CALL64pcrel32) :
X86::CALLpcrel32;
BuildMI(MBB, MBBI, DL,
TII.get(CallOp))
.addExternalSymbol(StackProbeSymbol)
.addReg(StackPtr, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
}
if (isEAXAlive) {
// Restore EAX
MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
X86::EAX),
StackPtr, false, NumBytes - 4);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);

if (Is64Bit || !STI.isOSWindows()) {
// MSVC x64's __chkstk, cygwin/mingw's ___chkstk_ms and __probestack do not
// adjust the stack pointer themselves. They also do not clobber the size
// register, so we can reuse it when adjusting the stack pointer.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::SUB64rr : X86::SUB32rr),
StackPtr)
.addReg(StackPtr)
.addReg(RegAcc)
.setMIFlag(MachineInstr::FrameSetup);
}
if (isRegAccAlive) {
// Restore RegAcc
auto MIB = BuildMI(MF, DL,
TII.get(Is64Bit ? X86::MOV64rm : X86::MOV32rm),
RegAcc);
MachineInstr *MI = addRegOffset(MIB, StackPtr, false, NumBytesAdj);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
}
} else if (NumBytes) {
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
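
To make the inline-probe arithmetic concrete, here is a rough sketch under an assumed i386-pc-linux-gnu triple (the function name and frame size are illustrative, not from the patch). For frames of at most 0x5000 bytes the prologue now subtracts the whole frame once and then writes an orl $0 into each 4 KiB page, at offsets NumBytes - 4096, NumBytes - 8192, and so on, so the guard page is touched in order without calling a probe routine; larger frames still go through __chkstk/__probestack.

define void @probe_inline() "probe-stack" {
  %buf = alloca [16384 x i8], align 16
  call void @sink([16384 x i8]* %buf)
  ret void
}
declare void @sink([16384 x i8]*)
; Expected prologue shape (frame size written as <frame>; the real value also
; includes alignment padding and any callee-saved spill):
;   subl $<frame>, %esp
;   orl  $0, <frame - 4096>(%esp)
;   orl  $0, <frame - 8192>(%esp)
;   orl  $0, <frame - 12288>(%esp)
;   orl  $0, <frame - 16384>(%esp)
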
37 changes: 25 additions & 12 deletions lib/Target/X86/X86ISelLowering.cpp
@@ -13581,7 +13581,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
SplitStack;
SplitStack || MF.shouldProbeStack();
SDLoc dl(Op);

if (!Lower) {
@@ -18093,7 +18093,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
// The lowering is pretty easy: we're just emitting the call to _alloca. The
// non-trivial part is impdef of ESP.

if (Subtarget->isTargetWin64()) {
if (Subtarget->isTargetWin64() || !Subtarget->isOSWindows()) {
if (Subtarget->isTargetCygMing()) {
// ___chkstk(Mingw64):
// Clobbers R10, R11, RAX and EFLAGS.
@@ -18106,16 +18106,29 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
.addReg(X86::RSP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
} else {
// __chkstk(MSVCRT): does not update stack pointer.
// Clobbers R10, R11 and EFLAGS.
BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
.addExternalSymbol("__chkstk")
.addReg(X86::RAX, RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
// RAX has the offset to be subtracted from RSP.
BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(X86::RAX);
const char *StackProbeSymbol =
Subtarget->isOSWindows() ? "__chkstk" : "__probestack";
if (Subtarget->is64Bit()) {
// __chkstk(MSVCRT): does not update stack pointer.
// Clobbers R10, R11 and EFLAGS.
BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
.addExternalSymbol(StackProbeSymbol)
.addReg(X86::RAX, RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
// RAX has the offset to be subtracted from RSP.
BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(X86::RAX);
} else {
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
.addReg(X86::EAX, RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
// EAX has the offset to be subtracted from ESP.
BuildMI(*BB, MI, DL, TII->get(X86::SUB32rr), X86::ESP)
.addReg(X86::ESP)
.addReg(X86::EAX);
}
}
} else {
const char *StackProbeSymbol =
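
The same scheme can be sketched for a dynamically sized alloca (hypothetical names, assumed x86_64-pc-linux-gnu triple; this is illustrative, not a test from the patch). With "probe-stack" set, LowerDYNAMIC_STACKALLOC now takes the probed path even off Windows: the requested size lands in the accumulator, __probestack is called, and because the probe routine leaves the stack pointer alone, the amount is subtracted from %rsp afterwards.

define void @dyn_probe(i64 %n) "probe-stack" {
  %p = alloca i8, i64 %n, align 1
  call void @use_dyn(i8* %p)
  ret void
}
declare void @use_dyn(i8*)
; Expected shape of the lowered alloca (illustrative):
;   movq <rounded size>, %rax
;   callq __probestack        (probes the new pages; %rsp is left untouched)
;   subq %rax, %rsp
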
4 changes: 4 additions & 0 deletions lib/Transforms/IPO/Inliner.cpp
@@ -137,6 +137,10 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,

AdjustCallerSSPLevel(Caller, Callee);

if (Callee->hasFnAttribute("probe-stack")) {
Caller->addFnAttr("probe-stack", "");
}

// Look at all of the allocas that we inlined through this call site. If we
// have already inlined other allocas through other calls into this function,
// then we know that they have disjoint lifetimes and that we can merge them.
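
A small IR example of why the attribute has to propagate during inlining (hypothetical names, not part of the patch's test suite): without the hunk above, inlining @big_frame into @caller would move the large alloca into a function whose prologue is never probed.

define internal void @big_frame() "probe-stack" {
  %buf = alloca [65536 x i8], align 16
  call void @sink64k([65536 x i8]* %buf)
  ret void
}
declare void @sink64k([65536 x i8]*)

define void @caller() {
  call void @big_frame()
  ret void
}
; After InlineCallIfPossible, @caller is expected to carry "probe-stack" too,
; so the 64 KiB frame folded into it still gets probed.
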
8 changes: 4 additions & 4 deletions test/CodeGen/X86/mingw-alloca.ll
@@ -22,14 +22,14 @@ entry:
; COFF: andl $-16, %esp
; COFF: pushl %eax
; COFF: calll __alloca
; COFF: movl 8028(%esp), %eax
; COFF: movl 80028(%esp), %eax
; ELF: foo2:
; ELF: andl $-16, %esp
; ELF: pushl %eax
; ELF: calll _alloca
; ELF: movl 8028(%esp), %eax
%A2 = alloca [2000 x i32], align 16 ; <[2000 x i32]*> [#uses=1]
%A2.sub = getelementptr [2000 x i32]* %A2, i32 0, i32 0 ; <i32*> [#uses=1]
; ELF: movl 80028(%esp), %eax
%A2 = alloca [20000 x i32], align 16 ; <[20000 x i32]*> [#uses=1]
%A2.sub = getelementptr [20000 x i32]* %A2, i32 0, i32 0 ; <i32*> [#uses=1]
call void @bar2( i32* %A2.sub, i32 %N )
ret void
}
2 changes: 1 addition & 1 deletion test/CodeGen/X86/pr17631.ll
@@ -18,7 +18,7 @@ allocas:

; CHECK: equal
; CHECK-NOT: vzeroupper
; CHECK: _chkstk
; CHECK: orl $0, 64(%esp)
; CHECK: ret

define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
37 changes: 37 additions & 0 deletions test/CodeGen/X86/stack-probes.ll
@@ -0,0 +1,37 @@
; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X86-Linux %s
; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=X64-Linux %s

declare void @use([40096 x i8]*)

define void @test() "probe-stack" {
%array = alloca [40096 x i8], align 16
call void @use([40096 x i8]* %array)
ret void

; X86-Linux-LABEL: test:
; X86-Linux: movl $40124, %eax
; X86-Linux-NEXT: calll __probestack
; X86-Linux-NEXT: subl %eax, %esp

; X64-Linux-LABEL: test:
; X64-Linux: movabsq $40104, %rax
; X64-Linux-NEXT: callq __probestack
; X64-Linux-NEXT: subq %rax, %rsp

}

declare void @use_fast([4096 x i8]*)

define void @test_fast() "probe-stack" {
%array = alloca [4096 x i8], align 16
call void @use_fast([4096 x i8]* %array)
ret void

; X86-Linux-LABEL: test_fast:
; X86-Linux: subl $4124, %esp
; X86-Linux-NEXT: orl $0, 28(%esp)

; X64-Linux-LABEL: test_fast:
; X64-Linux: subq $4104, %rsp
; X64-Linux-NEXT: orq $0, 8(%rsp)
}
12 changes: 6 additions & 6 deletions test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -10,23 +10,23 @@ define i64 @unaligned(i64 %n, i64 %x) nounwind {
; EFI-LABEL: unaligned:
entry:

%buf0 = alloca i8, i64 4096, align 1
%buf0 = alloca i8, i64 40096, align 1

; ___chkstk_ms does not adjust %rsp.
; M64: movq %rsp, %rbp
; M64: $4096, %rax
; M64: $40096, %rax
; M64: callq ___chkstk_ms
; M64: subq %rax, %rsp

; __chkstk does not adjust %rsp.
; W64: movq %rsp, %rbp
; W64: $4096, %rax
; W64: $40096, %rax
; W64: callq __chkstk
; W64: subq %rax, %rsp

; Freestanding
; EFI: movq %rsp, %rbp
; EFI: $[[B0OFS:4096|4104]], %rsp
; EFI: $[[B0OFS:40096|40104]], %rsp
; EFI-NOT: call

%buf1 = alloca i8, i64 %n, align 1
@@ -53,12 +53,12 @@

; M64: subq $48, %rsp
; M64: movq %rax, 32(%rsp)
; M64: leaq -4096(%rbp), %r9
; M64: leaq -40096(%rbp), %r9
; M64: callq bar

; W64: subq $48, %rsp
; W64: movq %rax, 32(%rsp)
; W64: leaq -4096(%rbp), %r9
; W64: leaq -40096(%rbp), %r9
; W64: callq bar

; EFI: subq $48, %rsp
8 changes: 4 additions & 4 deletions test/CodeGen/X86/win64_eh.ll
@@ -30,17 +30,17 @@ entry:
; Checks a stack allocation requiring call to __chkstk/___chkstk_ms
define void @foo2() uwtable {
entry:
%baz = alloca [4000 x i16], align 2
%baz = alloca [40000 x i16], align 2
ret void
}
; WIN64-LABEL: foo2:
; WIN64: .seh_proc foo2
; WIN64: movabsq $8000, %rax
; WIN64: movabsq $80000, %rax
; WIN64: callq {{__chkstk|___chkstk_ms}}
; WIN64: subq %rax, %rsp
; WIN64: .seh_stackalloc 8000
; WIN64: .seh_stackalloc 80000
; WIN64: .seh_endprologue
; WIN64: addq $8000, %rsp
; WIN64: addq $80000, %rsp
; WIN64: ret
; WIN64: .seh_endproc
