Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement inline stack probes for AArch64 #5353

Merged
merged 8 commits into from
Nov 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cranelift/codegen/meta/src/shared/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ pub(crate) fn define() -> SettingGroup {
"enable_probestack",
"Enable the use of stack probes for supported calling conventions.",
"",
true,
false,
);

settings.add_bool(
Expand Down
50 changes: 46 additions & 4 deletions cranelift/codegen/src/isa/aarch64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,14 +631,56 @@ impl ABIMachineSpec for AArch64MachineDeps {
insts
}

fn gen_probestack(_: u32) -> SmallInstVec<Self::I> {
fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
// TODO: implement if we ever require stack probes on an AArch64 host
// (unlikely unless Lucet is ported)
smallvec![]
unimplemented!("Stack probing is unimplemented on AArch64");
}

fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
unimplemented!("Inline stack probing is unimplemented on AArch64");
fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
// The stack probe loop currently takes 6 instructions and each inline
// probe takes 2 (ish, these numbers sort of depend on the constants).
// Set this to 3 to keep the max size of the probe to 6 instructions.
const PROBE_MAX_UNROLL: u32 = 3;

let probe_count = align_to(frame_size, guard_size) / guard_size;
if probe_count <= PROBE_MAX_UNROLL {
// When manually unrolling stick an instruction that stores 0 at a
// constant offset relative to the stack pointer. This will
// turn into something like `movn tmp, #n ; stur xzr [sp, tmp]`.
//
// Note that this may actually store beyond the stack size for the
// last item but that's ok since it's unused stack space and if
// that faults accidentally we're so close to faulting it shouldn't
// make too much difference to fault there.
insts.reserve(probe_count as usize);
for i in 0..probe_count {
let offset = (guard_size * (i + 1)) as i64;
insts.push(Self::gen_store_stack(
StackAMode::SPOffset(-offset, I8),
zero_reg(),
I32,
));
}
} else {
// The non-unrolled version uses two temporary registers. The
// `start` contains the current offset from sp and counts downwards
// during the loop by increments of `guard_size`. The `end` is
// the size of the frame and where we stop.
//
// Note that this emission is all post-regalloc so it should be ok
// to use the temporary registers here as input/output as the loop
// itself is not allowed to use the registers.
let start = writable_spilltmp_reg();
alexcrichton marked this conversation as resolved.
Show resolved Hide resolved
let end = writable_tmp2_reg();
insts.extend(Inst::load_constant(start, 0));
insts.extend(Inst::load_constant(end, frame_size.into()));
insts.push(Inst::StackProbeLoop {
start,
end: end.to_reg(),
step: Imm12::maybe_from_u64(guard_size.into()).unwrap(),
});
}
}

// Returns stack bytes used as well as instructions. Does not adjust
Expand Down
16 changes: 12 additions & 4 deletions cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -935,7 +935,16 @@

;; A dummy use, useful to keep a value alive.
(DummyUse
(reg Reg))))
(reg Reg))

;; Emits an inline stack probe loop.
;;
;; Note that this is emitted post-regalloc so `start` and `end` can be
;; temporary registers such as the spilltmp and tmp2 registers. This also
;; means that the internal codegen can't use these registers.
(StackProbeLoop (start WritableReg)
(end Reg)
(step Imm12))))

;; An ALU operation. This can be paired with several instruction formats
;; below (see `Inst`) in any combination.
Expand Down Expand Up @@ -1134,8 +1143,7 @@
;; [crate::isa::aarch64::abi](the ABI module) for more details.
(NominalSPOffset
(off i64)
(ty Type))
))
(ty Type))))

(type PairAMode extern (enum))
(type FPUOpRI extern (enum))
Expand Down Expand Up @@ -2242,7 +2250,7 @@
(let ((dst WritableReg (temp_writable_reg $I64)))
(ConsumesFlags.ConsumesFlagsReturnsResultWithProducer (MInst.CSet dst cond) dst)))

;; Helper for constructing `csetm` instructions.
(decl csetm (Cond) ConsumesFlags)
(rule (csetm cond)
(let ((dst WritableReg (temp_writable_reg $I64)))
Expand Down
89 changes: 74 additions & 15 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,14 @@ pub fn mem_finalize(
(smallvec![], mem)
} else {
let tmp = writable_spilltmp_reg();
let mut const_insts = Inst::load_constant(tmp, off as u64);
// N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
// (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
// is a valid base (for SPOffset) which we must handle here.
// Also, SP needs to be the first arg, not second.
let add_inst = Inst::AluRRRExtend {
alu_op: ALUOp::Add,
size: OperandSize::Size64,
rd: tmp,
rn: basereg,
rm: tmp.to_reg(),
extendop: ExtendOp::UXTX,
};
const_insts.push(add_inst);
(const_insts, AMode::reg(tmp.to_reg()))
(
Inst::load_constant(tmp, off as u64),
AMode::RegExtended {
rn: basereg,
rm: tmp.to_reg(),
extendop: ExtendOp::SXTX,
},
)
}
}

Expand Down Expand Up @@ -3424,6 +3417,72 @@ impl MachInstEmit for Inst {
}

&Inst::DummyUse { .. } => {}

// Emission of the stack-probe loop pseudo-instruction. Fix: the scraped diff
// had review-UI text ("…marked this conversation as resolved.") embedded in
// the middle of the comment block; removed. Code is otherwise unchanged.
&Inst::StackProbeLoop { start, end, step } => {
    assert!(emit_info.0.enable_probestack());
    let start = allocs.next_writable(start);
    let end = allocs.next(end);

    // The loop generated here uses `start` as a counter register to
    // count backwards until negating it exceeds `end`. In other
    // words `start` is an offset from `sp` we're testing where
    // `end` is the max size we need to test. The loop looks like:
    //
    //     loop_start:
    //         sub start, start, #step
    //         stur xzr, [sp, start]
    //         cmn start, end
    //         br.gt loop_start
    //     loop_end:
    //
    // Note that this loop cannot use the spilltmp and tmp2
    // registers as those are currently used as the input to this
    // loop when generating the instruction. This means that some
    // more flavorful address modes and lowerings need to be
    // avoided.
    //
    // Perhaps someone more clever than I can figure out how to use
    // `subs` or the like and skip the `cmn`, but I can't figure it
    // out at this time.

    let loop_start = sink.get_label();
    sink.bind_label(loop_start);

    // start -= step
    Inst::AluRRImm12 {
        alu_op: ALUOp::Sub,
        size: OperandSize::Size64,
        rd: start,
        rn: start.to_reg(),
        imm12: step,
    }
    .emit(&[], sink, emit_info, state);
    // stur wzr, [sp, start] — the actual probe store.
    Inst::Store32 {
        rd: regs::zero_reg(),
        mem: AMode::RegReg {
            rn: regs::stack_reg(),
            rm: start.to_reg(),
        },
        flags: MemFlags::trusted(),
    }
    .emit(&[], sink, emit_info, state);
    // adds xzr, start, end — i.e. `cmn start, end`, setting flags only.
    Inst::AluRRR {
        alu_op: ALUOp::AddS,
        size: OperandSize::Size64,
        rd: regs::writable_zero_reg(),
        rn: start.to_reg(),
        rm: end,
    }
    .emit(&[], sink, emit_info, state);

    let loop_end = sink.get_label();
    Inst::CondBr {
        taken: BranchTarget::Label(loop_start),
        not_taken: BranchTarget::Label(loop_end),
        kind: CondBrKind::Cond(Cond::Gt),
    }
    .emit(&[], sink, emit_info, state);
    sink.bind_label(loop_end);
}
}

let end_off = sink.cur_offset();
Expand Down
20 changes: 10 additions & 10 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1789,8 +1789,8 @@ fn test_aarch64_binemit() {
mem: AMode::FPOffset { off: 32768, ty: I8 },
flags: MemFlags::trusted(),
},
"100090D2B063308B010240F9",
"movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
"100090D2A1EB70F8",
"movz x16, #32768 ; ldr x1, [fp, x16, SXTX]",
));
insns.push((
Inst::ULoad64 {
Expand All @@ -1801,8 +1801,8 @@ fn test_aarch64_binemit() {
},
flags: MemFlags::trusted(),
},
"F0FF8F92B063308B010240F9",
"movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
"F0FF8F92A1EB70F8",
"movn x16, #32767 ; ldr x1, [fp, x16, SXTX]",
));
insns.push((
Inst::ULoad64 {
Expand All @@ -1813,8 +1813,8 @@ fn test_aarch64_binemit() {
}, // 2^20
flags: MemFlags::trusted(),
},
"1002A0D2B063308B010240F9",
"movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
"1002A0D2A1EB70F8",
"movz x16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
));
insns.push((
Inst::ULoad64 {
Expand All @@ -1825,8 +1825,8 @@ fn test_aarch64_binemit() {
}, // 2^20 + 1
flags: MemFlags::trusted(),
},
"300080521002A072B063308B010240F9",
"movz w16, #1 ; movk w16, w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
"300080521002A072A1EB70F8",
"movz w16, #1 ; movk w16, w16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
));

insns.push((
Expand Down Expand Up @@ -1867,8 +1867,8 @@ fn test_aarch64_binemit() {
},
flags: MemFlags::trusted(),
},
"1002A0D2F060308B010240F9",
"movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
"1002A0D2E1E870F8",
"movz x16, #16, LSL #16 ; ldr x1, [x7, x16, SXTX]",
));

insns.push((
Expand Down
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
&Inst::DummyUse { reg } => {
collector.reg_use(reg);
}
&Inst::StackProbeLoop { start, end, .. } => {
// The emitted probe loop decrements `start` while reading `end`, so
// `start` is marked as an early def to keep it from being assigned
// the same register as `end`; `step` is an immediate and contributes
// no register operand.
collector.reg_early_def(start);
collector.reg_use(end);
}
}
}

Expand Down Expand Up @@ -2887,6 +2891,12 @@ impl Inst {
let reg = pretty_print_reg(reg, allocs);
format!("dummy_use {}", reg)
}
&Inst::StackProbeLoop { start, end, step } => {
// Render as `stack_probe_loop <counter>, <limit>, <step>`; operands
// are resolved against `allocs` in declaration order.
format!(
    "stack_probe_loop {}, {}, {}",
    pretty_print_reg(start.to_reg(), allocs),
    pretty_print_reg(end, allocs),
    step.pretty_print(0, allocs),
)
}
}
}
}
Expand Down
22 changes: 10 additions & 12 deletions cranelift/codegen/src/isa/riscv64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,7 @@ impl ABIMachineSpec for Riscv64MachineDeps {
insts
}

fn gen_probestack(frame_size: u32) -> SmallInstVec<Self::I> {
let mut insts = SmallVec::new();
fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32) {
insts.extend(Inst::load_constant_u32(writable_a0(), frame_size as u64));
insts.push(Inst::Call {
info: Box::new(CallInfo {
Expand All @@ -377,7 +376,6 @@ impl ABIMachineSpec for Riscv64MachineDeps {
caller_callconv: CallConv::SystemV,
}),
});
insts
}
// Returns stack bytes used as well as instructions. Does not adjust
// nominal SP offset; abi_impl generic code will do that.
Expand Down Expand Up @@ -632,16 +630,16 @@ impl ABIMachineSpec for Riscv64MachineDeps {
|| fixed_frame_storage_size > 0
}

fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Self::I> {
fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
// Unroll at most n consecutive probes, before falling back to using a loop
const PROBE_MAX_UNROLL: u32 = 3;
// Number of probes that we need to perform
let probe_count = align_to(frame_size, guard_size) / guard_size;

if probe_count <= PROBE_MAX_UNROLL {
Self::gen_probestack_unroll(guard_size, probe_count)
Self::gen_probestack_unroll(insts, guard_size, probe_count)
} else {
Self::gen_probestack_loop(guard_size, probe_count)
Self::gen_probestack_loop(insts, guard_size, probe_count)
}
}
}
Expand Down Expand Up @@ -697,8 +695,8 @@ fn compute_clobber_size(clobbers: &[Writable<RealReg>]) -> u32 {
}

impl Riscv64MachineDeps {
fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
let mut insts = SmallVec::with_capacity(probe_count as usize);
fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
insts.reserve(probe_count as usize);
for i in 0..probe_count {
let offset = (guard_size * (i + 1)) as i64;
insts.push(Self::gen_store_stack(
Expand All @@ -707,13 +705,13 @@ impl Riscv64MachineDeps {
I32,
));
}
insts
}
fn gen_probestack_loop(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
smallvec![Inst::StackProbeLoop {

fn gen_probestack_loop(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
insts.push(Inst::StackProbeLoop {
guard_size,
probe_count,
tmp: Writable::from_reg(x_reg(28)), // t3
}]
});
}
}
10 changes: 7 additions & 3 deletions cranelift/codegen/src/isa/s390x/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -569,13 +569,17 @@ impl ABIMachineSpec for S390xMachineDeps {
SmallVec::new()
}

fn gen_probestack(_: u32) -> SmallInstVec<Self::I> {
fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
// TODO: implement if we ever require stack probes on an s390x host
// (unlikely unless Lucet is ported)
smallvec![]
unimplemented!("Stack probing is unimplemented on S390x");
}

fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I> {
fn gen_inline_probestack(
_insts: &mut SmallInstVec<Self::I>,
_frame_size: u32,
_guard_size: u32,
) {
unimplemented!("Inline stack probing is unimplemented on S390x");
}

Expand Down
Loading