diff --git a/cranelift/codegen/meta/src/shared/settings.rs b/cranelift/codegen/meta/src/shared/settings.rs
index d006d38815c2..b9fbef55c9e2 100644
--- a/cranelift/codegen/meta/src/shared/settings.rs
+++ b/cranelift/codegen/meta/src/shared/settings.rs
@@ -273,7 +273,7 @@ pub(crate) fn define() -> SettingGroup {
         "enable_probestack",
         "Enable the use of stack probes for supported calling conventions.",
         "",
-        true,
+        false,
     );
 
     settings.add_bool(
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index be6d2d978b04..d57a804ef7d7 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -631,14 +631,56 @@ impl ABIMachineSpec for AArch64MachineDeps {
         insts
     }
 
-    fn gen_probestack(_: u32) -> SmallInstVec<Inst> {
+    fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
         // TODO: implement if we ever require stack probes on an AArch64 host
         // (unlikely unless Lucet is ported)
-        smallvec![]
+        unimplemented!("Stack probing is unimplemented on AArch64");
     }
 
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Inst> {
-        unimplemented!("Inline stack probing is unimplemented on AArch64");
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
+        // The stack probe loop currently takes 6 instructions and each
+        // inline probe takes 2 (roughly; the exact counts depend on the
+        // constants). Set this to 3 to keep the max size of the probe to 6
+        // instructions.
+        const PROBE_MAX_UNROLL: u32 = 3;
+
+        let probe_count = align_to(frame_size, guard_size) / guard_size;
+        if probe_count <= PROBE_MAX_UNROLL {
+            // When manually unrolling, emit an instruction that stores 0 at
+            // a constant offset relative to the stack pointer. This turns
+            // into something like `movn tmp, #n ; str wzr, [sp, tmp, SXTX]`.
+            //
+            // Note that the last probe may store beyond the actual frame
+            // size, but that's fine: it lands in unused stack space, and if
+            // that store faults we were about to fault legitimately right
+            // next to it anyway.
+            insts.reserve(probe_count as usize);
+            for i in 0..probe_count {
+                let offset = (guard_size * (i + 1)) as i64;
+                insts.push(Self::gen_store_stack(
+                    StackAMode::SPOffset(-offset, I8),
+                    zero_reg(),
+                    I32,
+                ));
+            }
+        } else {
+            // The non-unrolled version uses two temporary registers: `start`
+            // holds the current offset from sp and counts downwards in steps
+            // of `guard_size`, while `end` holds the size of the frame,
+            // which is where we stop.
+            //
+            // Note that this is emitted post-regalloc, so it's safe to use
+            // the temporary registers as the loop's input/output; the loop
+            // itself is not allowed to use them.
+            let start = writable_spilltmp_reg();
+            let end = writable_tmp2_reg();
+            insts.extend(Inst::load_constant(start, 0));
+            insts.extend(Inst::load_constant(end, frame_size.into()));
+            insts.push(Inst::StackProbeLoop {
+                start,
+                end: end.to_reg(),
+                step: Imm12::maybe_from_u64(guard_size.into()).unwrap(),
+            });
+        }
     }
 
     // Returns stack bytes used as well as instructions. Does not adjust
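The unroll-vs-loop decision above reappears in the same shape in the riscv64 and x64 backends later in this patch. A minimal standalone sketch of the heuristic, with the aarch64 constant (the helper below is illustrative, not the real cranelift `align_to`):

// Sketch: how many probes a frame needs, and which strategy that picks.
fn probe_count(frame_size: u32, guard_size: u32) -> u32 {
    // Equivalent to align_to(frame_size, guard_size) / guard_size.
    (frame_size + guard_size - 1) / guard_size
}

fn main() {
    const PROBE_MAX_UNROLL: u32 = 3; // the aarch64 value above
    // 12 KiB frame, 4 KiB guard pages: 3 probes, fully unrolled.
    assert!(probe_count(12288, 4096) <= PROBE_MAX_UNROLL);
    // ~100 KB frame: 25 probes, so a StackProbeLoop is emitted instead.
    assert_eq!(probe_count(100_000, 4096), 25);
}

The `%unrolled` and `%large` functions in the new aarch64 filetest later in this patch exercise exactly these two paths.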
diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 46e8e842d421..b61ca529c997 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -935,7 +935,16 @@
     ;; A dummy use, useful to keep a value alive.
     (DummyUse
-      (reg Reg))))
+      (reg Reg))
+
+    ;; Emits an inline stack probe loop.
+    ;;
+    ;; Note that this is emitted post-regalloc so `start` and `end` can be
+    ;; temporary registers such as the spilltmp and tmp2 registers. This also
+    ;; means that the internal codegen can't use these registers.
+    (StackProbeLoop (start WritableReg)
+                    (end Reg)
+                    (step Imm12))))
 
 ;; An ALU operation. This can be paired with several instruction formats
 ;; below (see `Inst`) in any combination.
@@ -1134,8 +1143,7 @@
   ;; [crate::isa::aarch64::abi](the ABI module) for more details.
   (NominalSPOffset (off i64)
-                   (ty Type))
-))
+                   (ty Type))))
 
 (type PairAMode extern (enum))
 (type FPUOpRI extern (enum))
@@ -2242,7 +2250,7 @@
   (let ((dst WritableReg (temp_writable_reg $I64)))
     (ConsumesFlags.ConsumesFlagsReturnsResultWithProducer (MInst.CSet dst cond) dst)))
 
-;; Helper for constructing `csetm` instructions. 
+;; Helper for constructing `csetm` instructions.
 (decl csetm (Cond) ConsumesFlags)
 (rule (csetm cond)
   (let ((dst WritableReg (temp_writable_reg $I64)))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d2277b4ba6dd..0280a4c00f8b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -63,21 +63,14 @@ pub fn mem_finalize(
         (smallvec![], mem)
     } else {
         let tmp = writable_spilltmp_reg();
-        let mut const_insts = Inst::load_constant(tmp, off as u64);
-        // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
-        // (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
-        // is a valid base (for SPOffset) which we must handle here.
-        // Also, SP needs to be the first arg, not second.
-        let add_inst = Inst::AluRRRExtend {
-            alu_op: ALUOp::Add,
-            size: OperandSize::Size64,
-            rd: tmp,
-            rn: basereg,
-            rm: tmp.to_reg(),
-            extendop: ExtendOp::UXTX,
-        };
-        const_insts.push(add_inst);
-        (const_insts, AMode::reg(tmp.to_reg()))
+        (
+            Inst::load_constant(tmp, off as u64),
+            AMode::RegExtended {
+                rn: basereg,
+                rm: tmp.to_reg(),
+                extendop: ExtendOp::SXTX,
+            },
+        )
     }
 }
 
@@ -3424,6 +3417,72 @@ impl MachInstEmit for Inst {
             }
 
             &Inst::DummyUse { .. } => {}
+
+            &Inst::StackProbeLoop { start, end, step } => {
+                assert!(emit_info.0.enable_probestack());
+                let start = allocs.next_writable(start);
+                let end = allocs.next(end);
+
+                // The loop generated here uses `start` as a counter register
+                // to count backwards until negating it exceeds `end`. In
+                // other words `start` is an offset from `sp` we're testing
+                // where `end` is the max size we need to test. The loop
+                // looks like:
+                //
+                // loop_start:
+                //     sub start, start, #step
+                //     str wzr, [sp, start]
+                //     cmn start, end
+                //     b.gt loop_start
+                // loop_end:
+                //
+                // Note that this loop cannot use the spilltmp and tmp2
+                // registers as those are currently used as the input to this
+                // loop when generating the instruction. This means that some
+                // more flavorful address modes and lowerings need to be
+                // avoided.
+                //
+                // Perhaps someone more clever than I can figure out how to
+                // use `subs` or the like and skip the `cmn`, but I can't
+                // figure it out at this time.
+
+                let loop_start = sink.get_label();
+                sink.bind_label(loop_start);
+
+                Inst::AluRRImm12 {
+                    alu_op: ALUOp::Sub,
+                    size: OperandSize::Size64,
+                    rd: start,
+                    rn: start.to_reg(),
+                    imm12: step,
+                }
+                .emit(&[], sink, emit_info, state);
+                Inst::Store32 {
+                    rd: regs::zero_reg(),
+                    mem: AMode::RegReg {
+                        rn: regs::stack_reg(),
+                        rm: start.to_reg(),
+                    },
+                    flags: MemFlags::trusted(),
+                }
+                .emit(&[], sink, emit_info, state);
+                Inst::AluRRR {
+                    alu_op: ALUOp::AddS,
+                    size: OperandSize::Size64,
+                    rd: regs::writable_zero_reg(),
+                    rn: start.to_reg(),
+                    rm: end,
+                }
+                .emit(&[], sink, emit_info, state);
+
+                let loop_end = sink.get_label();
+                Inst::CondBr {
+                    taken: BranchTarget::Label(loop_start),
+                    not_taken: BranchTarget::Label(loop_end),
+                    kind: CondBrKind::Cond(Cond::Gt),
+                }
+                .emit(&[], sink, emit_info, state);
+                sink.bind_label(loop_end);
+            }
         }
 
         let end_off = sink.cur_offset();
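Tracing the counter makes the `cmn`/`b.gt` exit condition concrete: `start` runs 0, -step, -2*step, and so on, each iteration stores to `[sp + start]`, and `cmn start, end` sets flags on `start + end`, so the backwards branch is taken while `end + start > 0`. A standalone sketch of the resulting iteration count (illustrative only; it mirrors the four instructions above):

// Counts the stores the emitted probe loop performs for a given frame/step.
fn loop_probes(frame_size: i64, step: i64) -> i64 {
    let mut start = 0i64;
    let mut probes = 0i64;
    loop {
        start -= step; // sub start, start, #step
        probes += 1; // str wzr, [sp, start]
        if start + frame_size <= 0 {
            // cmn start, end; b.gt loops only while start + end > 0
            break;
        }
    }
    probes
}

fn main() {
    // The %large case in the new filetest: a 100_000-byte frame with a
    // 4 KiB step probes 25 times, touching down to [sp - 102400].
    assert_eq!(loop_probes(100_000, 4096), 25);
}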
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 8e06737f4617..b5e926b01197 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1789,8 +1789,8 @@ fn test_aarch64_binemit() {
             mem: AMode::FPOffset { off: 32768, ty: I8 },
             flags: MemFlags::trusted(),
         },
-        "100090D2B063308B010240F9",
-        "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "100090D2A1EB70F8",
+        "movz x16, #32768 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1801,8 +1801,8 @@ fn test_aarch64_binemit() {
             },
             flags: MemFlags::trusted(),
         },
-        "F0FF8F92B063308B010240F9",
-        "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "F0FF8F92A1EB70F8",
+        "movn x16, #32767 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1813,8 +1813,8 @@ fn test_aarch64_binemit() {
             }, // 2^20
             flags: MemFlags::trusted(),
         },
-        "1002A0D2B063308B010240F9",
-        "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "1002A0D2A1EB70F8",
+        "movz x16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1825,8 +1825,8 @@ fn test_aarch64_binemit() {
            }, // 2^20 + 1
            flags: MemFlags::trusted(),
        },
-        "300080521002A072B063308B010240F9",
-        "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "300080521002A072A1EB70F8",
+        "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
     ));
 
     insns.push((
@@ -1867,8 +1867,8 @@ fn test_aarch64_binemit() {
             },
             flags: MemFlags::trusted(),
         },
-        "1002A0D2F060308B010240F9",
-        "movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
+        "1002A0D2E1E870F8",
+        "movz x16, #16, LSL #16 ; ldr x1, [x7, x16, SXTX]",
     ));
 
     insns.push((
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 2acef7e191e3..3ed883a49455 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -1103,6 +1103,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) {
         &Inst::DummyUse { reg } => {
             collector.reg_use(reg);
         }
+        &Inst::StackProbeLoop { start, end, .. } => {
+            collector.reg_early_def(start);
+            collector.reg_use(end);
+        }
     }
 }
 
@@ -2887,6 +2891,12 @@ impl Inst {
                 let reg = pretty_print_reg(reg, allocs);
                 format!("dummy_use {}", reg)
             }
+            &Inst::StackProbeLoop { start, end, step } => {
+                let start = pretty_print_reg(start.to_reg(), allocs);
+                let end = pretty_print_reg(end, allocs);
+                let step = step.pretty_print(0, allocs);
+                format!("stack_probe_loop {start}, {end}, {step}")
+            }
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs
index 7aeee8ba57d7..995a52fbc09d 100644
--- a/cranelift/codegen/src/isa/riscv64/abi.rs
+++ b/cranelift/codegen/src/isa/riscv64/abi.rs
@@ -360,8 +360,7 @@ impl ABIMachineSpec for Riscv64MachineDeps {
         insts
     }
 
-    fn gen_probestack(frame_size: u32) -> SmallInstVec<Inst> {
-        let mut insts = SmallVec::new();
+    fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32) {
         insts.extend(Inst::load_constant_u32(writable_a0(), frame_size as u64));
         insts.push(Inst::Call {
             info: Box::new(CallInfo {
@@ -377,7 +376,6 @@ impl ABIMachineSpec for Riscv64MachineDeps {
                 caller_callconv: CallConv::SystemV,
             }),
         });
-        insts
     }
 
     // Returns stack bytes used as well as instructions. Does not adjust
     // nominal SP offset; abi_impl generic code will do that.
@@ -632,16 +630,16 @@ impl ABIMachineSpec for Riscv64MachineDeps {
             || fixed_frame_storage_size > 0
     }
 
-    fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Inst> {
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
         // Unroll at most n consecutive probes before falling back to using a loop
         const PROBE_MAX_UNROLL: u32 = 3;
 
         // Number of probes that we need to perform
         let probe_count = align_to(frame_size, guard_size) / guard_size;
 
         if probe_count <= PROBE_MAX_UNROLL {
-            Self::gen_probestack_unroll(guard_size, probe_count)
+            Self::gen_probestack_unroll(insts, guard_size, probe_count)
         } else {
-            Self::gen_probestack_loop(guard_size, probe_count)
+            Self::gen_probestack_loop(insts, guard_size, probe_count)
         }
     }
 }
@@ -697,8 +695,8 @@ fn compute_clobber_size(clobbers: &[Writable<RealReg>]) -> u32 {
 }
 
 impl Riscv64MachineDeps {
-    fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
-        let mut insts = SmallVec::with_capacity(probe_count as usize);
+    fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
+        insts.reserve(probe_count as usize);
         for i in 0..probe_count {
             let offset = (guard_size * (i + 1)) as i64;
             insts.push(Self::gen_store_stack(
@@ -707,13 +705,13 @@ impl Riscv64MachineDeps {
                 I32,
             ));
         }
-        insts
     }
-    fn gen_probestack_loop(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
-        smallvec![Inst::StackProbeLoop {
+
+    fn gen_probestack_loop(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
+        insts.push(Inst::StackProbeLoop {
             guard_size,
             probe_count,
             tmp: Writable::from_reg(x_reg(28)), // t3
-        }]
+        });
     }
 }
diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs
index 45e051f0216a..f123a4896697 100644
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -569,13 +569,17 @@ impl ABIMachineSpec for S390xMachineDeps {
         SmallVec::new()
     }
 
-    fn gen_probestack(_: u32) -> SmallInstVec<Inst> {
+    fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
         // TODO: implement if we ever require stack probes on an s390x host
         // (unlikely unless Lucet is ported)
-        smallvec![]
+        unimplemented!("Stack probing is unimplemented on S390x");
     }
 
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Inst> {
+    fn gen_inline_probestack(
+        _insts: &mut SmallInstVec<Self::I>,
+        _frame_size: u32,
+        _guard_size: u32,
+    ) {
         unimplemented!("Inline stack probing is unimplemented on S390x");
     }
 
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index 91bde772c6a0..d2b9d6e18ee0 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -30,8 +30,8 @@ pub(crate) type X64Caller = Caller<X64ABIMachineSpec>;
 pub struct X64ABIMachineSpec;
 
 impl X64ABIMachineSpec {
-    fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
-        let mut insts = SmallVec::with_capacity(probe_count as usize);
+    fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
+        insts.reserve(probe_count as usize);
         for i in 0..probe_count {
             let offset = (guard_size * (i + 1)) as i64;
 
@@ -43,9 +43,8 @@ impl X64ABIMachineSpec {
                 I32,
             ));
         }
-        insts
     }
-    fn gen_probestack_loop(frame_size: u32, guard_size: u32) -> SmallInstVec<Inst> {
+
+    fn gen_probestack_loop(insts: &mut SmallInstVec<Inst>, frame_size: u32, guard_size: u32) {
         // We have to use a caller saved register since clobbering only happens
         // after stack probing.
         //
@@ -57,11 +56,11 @@ impl X64ABIMachineSpec {
             !is_callee_save_systemv(real_reg, false) && !is_callee_save_fastcall(real_reg, false)
         });
 
-        smallvec![Inst::StackProbeLoop {
+        insts.push(Inst::StackProbeLoop {
             tmp: Writable::from_reg(tmp),
             frame_size,
             guard_size,
-        }]
+        });
     }
 }
 
@@ -420,8 +419,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         insts
     }
 
-    fn gen_probestack(frame_size: u32) -> SmallInstVec<Self::I> {
-        let mut insts = SmallVec::new();
+    fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32) {
         insts.push(Inst::imm(
             OperandSize::Size32,
             frame_size as u64,
@@ -438,10 +436,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
                 opcode: Opcode::Call,
             }),
         });
-        insts
     }
 
-    fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Self::I> {
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
         // Unroll at most n consecutive probes before falling back to using a loop
         //
         // This number was picked because the loop version is 38 bytes long. We can fit
@@ -452,9 +449,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         let probe_count = align_to(frame_size, guard_size) / guard_size;
 
         if probe_count <= PROBE_MAX_UNROLL {
-            Self::gen_probestack_unroll(guard_size, probe_count)
+            Self::gen_probestack_unroll(insts, guard_size, probe_count)
         } else {
-            Self::gen_probestack_loop(frame_size, guard_size)
+            Self::gen_probestack_loop(insts, frame_size, guard_size)
         }
     }
 
diff --git a/cranelift/codegen/src/isa/x64/encoding/rex.rs b/cranelift/codegen/src/isa/x64/encoding/rex.rs
index 4ca681a1bcf8..41ae596eba94 100644
--- a/cranelift/codegen/src/isa/x64/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/rex.rs
@@ -13,7 +13,7 @@ use crate::{
     ir::TrapCode,
     isa::x64::inst::{
         args::{Amode, OperandSize},
-        regs, EmitInfo, Inst, LabelUse,
+        regs, Inst, LabelUse,
     },
     machinst::MachBuffer,
 };
@@ -293,7 +293,6 @@ impl Default for LegacyPrefixes {
 /// indicate a 64-bit operation.
 pub(crate) fn emit_std_enc_mem(
     sink: &mut MachBuffer<Inst>,
-    info: &EmitInfo,
     prefixes: LegacyPrefixes,
     opcodes: u32,
     mut num_opcodes: usize,
@@ -315,12 +314,6 @@ pub(crate) fn emit_std_enc_mem(
 
     match *mem_e {
         Amode::ImmReg { simm32, base, .. } => {
-            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
-            // first touch of a new stack page.
-            if base == regs::rsp() && !can_trap && info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
-
             // First, the REX byte.
             let enc_e = int_reg_enc(base);
             rex.emit_two_op(sink, enc_g, enc_e);
@@ -381,12 +374,6 @@ pub(crate) fn emit_std_enc_mem(
             shift,
             ..
         } => {
-            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
-            // first touch of a new stack page.
-            if *reg_base == regs::rsp() && !can_trap && info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
-
             let enc_base = int_reg_enc(*reg_base);
             let enc_index = int_reg_enc(*reg_index);
 
@@ -481,7 +468,6 @@ pub(crate) fn emit_std_enc_enc(
 
 pub(crate) fn emit_std_reg_mem(
     sink: &mut MachBuffer<Inst>,
-    info: &EmitInfo,
     prefixes: LegacyPrefixes,
     opcodes: u32,
     num_opcodes: usize,
@@ -493,7 +479,6 @@ pub(crate) fn emit_std_reg_mem(
     let enc_g = reg_enc(reg_g);
     emit_std_enc_mem(
         sink,
-        info,
         prefixes,
         opcodes,
         num_opcodes,
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 6b468d2bc6f9..73e25ca47031 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -171,7 +171,6 @@ pub(crate) fn emit(
             let amode = addr.finalize(state, sink);
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 0x0FAF,
                 2,
@@ -223,7 +222,6 @@ pub(crate) fn emit(
             // Here we revert to the "normal" G-E ordering.
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcode_m,
                 1,
@@ -275,7 +273,6 @@ pub(crate) fn emit(
             let enc_g = int_reg_enc(src2);
             emit_std_enc_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcode,
                 1,
@@ -317,17 +314,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr: src } => {
                 let amode = src.finalize(state, sink).with_allocs(allocs);
-                emit_std_reg_mem(
-                    sink,
-                    info,
-                    prefix,
-                    opcode,
-                    num_opcodes,
-                    dst,
-                    &amode,
-                    rex_flags,
-                    0,
-                );
+                emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst, &amode, rex_flags, 0);
             }
         }
     }
@@ -414,7 +401,6 @@ pub(crate) fn emit(
             let amode = src.finalize(state, sink).with_allocs(allocs);
             emit_std_enc_mem(
                 sink,
-                info,
                 prefix,
                 opcode,
                 1,
@@ -459,7 +445,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr: src } => {
                 let amode = src.finalize(state, sink).with_allocs(allocs);
-                emit_std_enc_mem(sink, info, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
+                emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
             }
         }
     }
@@ -779,7 +765,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcodes,
                 num_opcodes,
@@ -798,7 +783,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 0x8B,
                 1,
@@ -815,7 +799,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 0x8D,
                 1,
@@ -877,7 +860,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcodes,
                 num_opcodes,
@@ -913,7 +895,7 @@ pub(crate) fn emit(
             // 16-bit: MOV r16, r/m16 is 66 (REX.W==0) 89 /r
             // 32-bit: MOV r32, r/m32 is (REX.W==0) 89 /r
             // 64-bit: MOV r64, r/m64 is (REX.W==1) 89 /r
-            emit_std_reg_mem(sink, info, prefix, opcode, 1, src, dst, rex, 0);
+            emit_std_reg_mem(sink, prefix, opcode, 1, src, dst, rex, 0);
         }
 
         Inst::ShiftR {
@@ -1023,7 +1005,7 @@ pub(crate) fn emit(
             }
             RegMemImm::Mem { addr } => {
                 let addr = &addr.finalize(state, sink).with_allocs(allocs);
-                emit_std_reg_mem(sink, info, prefix, opcode_bytes, 2, dst, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode_bytes, 2, dst, addr, rex, 0);
             }
             RegMemImm::Imm { .. } => unreachable!(),
         }
@@ -1078,7 +1060,7 @@ pub(crate) fn emit(
                     (OperandSize::Size8, false) => 0x84,
                     (_, false) => 0x85,
                 };
-                emit_std_reg_mem(sink, info, prefix, opcode, 1, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, 1, reg_g, addr, rex, 0);
             }
 
             RegMemImm::Imm { simm32 } => {
@@ -1167,7 +1149,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink).with_allocs(allocs);
-                emit_std_reg_mem(sink, info, prefix, opcode, 2, dst, addr, rex_flags, 0);
+                emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex_flags, 0);
             }
         }
     }
@@ -1210,10 +1192,6 @@ pub(crate) fn emit(
         Inst::Push64 { src } => {
             let src = src.clone().to_reg_mem_imm().with_allocs(allocs);
 
-            if info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
-
             match src {
                 RegMemImm::Reg { reg } => {
                     let enc_reg = int_reg_enc(reg);
@@ -1228,7 +1206,6 @@ pub(crate) fn emit(
                     let addr = &addr.finalize(state, sink);
                     emit_std_enc_mem(
                         sink,
-                        info,
                         LegacyPrefixes::None,
                         0xFF,
                         1,
@@ -1369,9 +1346,6 @@ pub(crate) fn emit(
             info: call_info,
             ..
         } => {
-            if info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
             if let Some(s) = state.take_stack_map() {
                 sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s);
             }
@@ -1392,9 +1366,6 @@ pub(crate) fn emit(
         } => {
             let dest = dest.with_allocs(allocs);
 
-            if info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
             let start_offset = sink.cur_offset();
             match dest {
                 RegMem::Reg { reg } => {
@@ -1414,7 +1385,6 @@ pub(crate) fn emit(
                     let addr = &addr.finalize(state, sink);
                     emit_std_enc_mem(
                         sink,
-                        info,
                         LegacyPrefixes::None,
                         0xFF,
                         1,
@@ -1516,7 +1486,6 @@ pub(crate) fn emit(
                     let addr = &addr.finalize(state, sink);
                     emit_std_enc_mem(
                         sink,
-                        info,
                         LegacyPrefixes::None,
                         0xFF,
                         1,
@@ -1739,7 +1708,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
             }
         };
     }
@@ -1765,7 +1734,7 @@ pub(crate) fn emit(
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
                 // N.B.: bytes_at_end == 1, because of the `imm` byte below.
-                emit_std_reg_mem(sink, info, prefix, opcode, len, dst, addr, rex, 1);
+                emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
             }
         }
         sink.put1(*imm);
@@ -1918,7 +1887,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, length, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
             }
         }
     }
@@ -1951,7 +1920,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, length, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
             }
         }
     }
@@ -2191,7 +2160,7 @@ pub(crate) fn emit(
                     "No existing way to encode a mem argument in the ModRM r/m field."
                 );
                 // N.B.: bytes_at_end == 1, because of the `imm` byte below.
-                emit_std_reg_mem(sink, info, prefix, opcode, len, dst, addr, rex, 1);
+                emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
             }
         }
         sink.put1(*imm);
@@ -2217,17 +2186,7 @@ pub(crate) fn emit(
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let dst = &dst.finalize(state, sink);
-            emit_std_reg_mem(
-                sink,
-                info,
-                prefix,
-                opcode,
-                2,
-                src,
-                dst,
-                RexFlags::clear_w(),
-                0,
-            );
+            emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
         }
 
         Inst::XmmToGpr {
@@ -2280,7 +2239,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, 2, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, 2, reg_g, addr, rex, 0);
             }
         }
     }
@@ -2303,7 +2262,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, len, dst, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 0);
             }
         }
     }
@@ -2912,7 +2871,7 @@ pub(crate) fn emit(
             };
             let rex = RexFlags::from((OperandSize::from_ty(*ty), replacement));
             let amode = mem.finalize(state, sink);
-            emit_std_reg_mem(sink, info, prefix, opcodes, 2, replacement, &amode, rex, 0);
+            emit_std_reg_mem(sink, prefix, opcodes, 2, replacement, &amode, rex, 0);
         }
 
         Inst::AtomicRmwSeq {
diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
index 450a76ffabe9..b82ce632e189 100644
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -500,10 +500,10 @@
     fn gen_epilogue_frame_restore(flags: &settings::Flags) -> SmallInstVec<Self::I>;
 
     /// Generate a probestack call.
-    fn gen_probestack(_frame_size: u32) -> SmallInstVec<Self::I>;
+    fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32);
 
     /// Generate an inline stack probe.
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I>;
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32);
 
     /// Get all clobbered registers that are callee-saved according to the ABI; the result
     /// contains the registers in a sorted order.
@@ -1838,14 +1838,13 @@ impl<M: ABIMachineSpec> Callee<M> {
             .map_or(false, |min_frame| total_stacksize >= min_frame);
 
         if needs_probestack {
-            insts.extend(
-                if self.flags.probestack_strategy() == ProbestackStrategy::Inline {
+            match self.flags.probestack_strategy() {
+                ProbestackStrategy::Inline => {
                     let guard_size = 1 << self.flags.probestack_size_log2();
-                    M::gen_inline_probestack(total_stacksize, guard_size)
-                } else {
-                    M::gen_probestack(total_stacksize)
-                },
-            );
+                    M::gen_inline_probestack(&mut insts, total_stacksize, guard_size)
+                }
+                ProbestackStrategy::Outline => M::gen_probestack(&mut insts, total_stacksize),
+            }
         }
     }
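The trait change above turns both hooks from "return a fresh vector" into "append to the caller's buffer". A simplified before/after sketch of that calling convention (types shrunk to `Vec<String>` for illustration; these are not the real cranelift types):

// Old shape: each backend allocated and returned its own vector, which
// the caller then spliced in with `insts.extend(...)`.
fn gen_probestack_old(frame_size: u32) -> Vec<String> {
    vec![format!("probe {frame_size}")]
}

// New shape: backends push straight into the prologue's existing buffer,
// dropping the intermediate allocation.
fn gen_probestack_new(insts: &mut Vec<String>, frame_size: u32) {
    insts.push(format!("probe {frame_size}"));
}

fn main() {
    let mut insts = vec!["prologue".to_string()];
    insts.extend(gen_probestack_old(4096));
    gen_probestack_new(&mut insts, 4096);
    assert_eq!(insts.len(), 3);
}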
diff --git a/cranelift/codegen/src/settings.rs b/cranelift/codegen/src/settings.rs
index 0a214dfbde2e..4418f990a178 100644
--- a/cranelift/codegen/src/settings.rs
+++ b/cranelift/codegen/src/settings.rs
@@ -545,7 +545,7 @@ enable_llvm_abi_extensions = false
 unwind_info = true
 preserve_frame_pointers = false
 machine_code_cfg_info = false
-enable_probestack = true
+enable_probestack = false
 probestack_func_adjusts_sp = false
 enable_jump_tables = true
 enable_heap_access_spectre_mitigation = true
diff --git a/cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif b/cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif
new file mode 100644
index 000000000000..38838174b9b9
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif
@@ -0,0 +1,71 @@
+test compile precise-output
+set enable_probestack=true
+set probestack_strategy=inline
+; This is the default and is equivalent to a page size of 4096
+set probestack_size_log2=12
+target aarch64
+
+
+; If the stack size is just one page, we can avoid the stack probe entirely
+function %single_page() -> i64 system_v {
+ss0 = explicit_slot 2048
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; stp fp, lr, [sp, #-16]!
+; mov fp, sp
+; sub sp, sp, #2048
+; block0:
+; mov x0, sp
+; add sp, sp, #2048
+; ldp fp, lr, [sp], #16
+; ret
+
+function %unrolled() -> i64 system_v {
+ss0 = explicit_slot 12288
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; stp fp, lr, [sp, #-16]!
+; mov fp, sp
+; movn x16, #4095 ; str wzr, [sp, x16, SXTX]
+; movn x16, #8191 ; str wzr, [sp, x16, SXTX]
+; movn x16, #12287 ; str wzr, [sp, x16, SXTX]
+; sub sp, sp, #12288
+; block0:
+; mov x0, sp
+; add sp, sp, #12288
+; ldp fp, lr, [sp], #16
+; ret
+
+function %large() -> i64 system_v {
+ss0 = explicit_slot 100000
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; stp fp, lr, [sp, #-16]!
+; mov fp, sp
+; movz x16, #0
+; movz w17, #34464
+; movk w17, w17, #1, LSL #16
+; stack_probe_loop x16, x17, #4096
+; movz w16, #34464
+; movk w16, w16, #1, LSL #16
+; sub sp, sp, x16, UXTX
+; block0:
+; mov x0, sp
+; movz w16, #34464
+; movk w16, w16, #1, LSL #16
+; add sp, sp, x16, UXTX
+; ldp fp, lr, [sp], #16
+; ret
+
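The `set` lines at the top of this filetest correspond one-to-one to flags an embedder would set programmatically. A hypothetical sketch against cranelift-codegen's settings API (the flag names are the real ones from this patch; the surrounding function and the anyhow error handling are illustrative assumptions):

use cranelift_codegen::settings::{self, Configurable};

fn inline_probestack_flags() -> anyhow::Result<settings::Flags> {
    let mut builder = settings::builder();
    builder.enable("enable_probestack")?; // now disabled by default
    builder.set("probestack_strategy", "inline")?; // no __probestack libcall
    builder.set("probestack_size_log2", "12")?; // assumes 4 KiB guard pages
    Ok(settings::Flags::new(builder))
}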
diff --git a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
index 703d8c4b35b0..e93dc41c429f 100644
--- a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
@@ -177,7 +177,7 @@ block0(v0: i64):
 
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
-; movz w16, #6784 ; movk w16, w16, #6, LSL #16 ; add x16, x0, x16, UXTX ; ldr x16, [x16]
+; movz w16, #6784 ; movk w16, w16, #6, LSL #16 ; ldr x16, [x0, x16, SXTX]
 ; add x16, x16, #32
 ; subs xzr, sp, x16, UXTX
 ; b.hs 8 ; udf
diff --git a/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif b/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif
index 46f612db4299..7706815fa519 100644
--- a/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif
@@ -1,5 +1,6 @@
 test compile precise-output
 set unwind_info=false
+set enable_probestack=true
 target riscv64
 
 function %foo() {
diff --git a/cranelift/filetests/filetests/isa/riscv64/stack.clif b/cranelift/filetests/filetests/isa/riscv64/stack.clif
index c5073cf78cac..d70250ceb409 100644
--- a/cranelift/filetests/filetests/isa/riscv64/stack.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/stack.clif
@@ -1,5 +1,6 @@
 test compile precise-output
 set unwind_info=false
+set enable_probestack=true
 target riscv64
 
 function %stack_addr_small() -> i64 {
diff --git a/cranelift/filetests/filetests/runtests/inline-probestack.clif b/cranelift/filetests/filetests/runtests/inline-probestack.clif
index 21426137d86b..ea5e4241cc4b 100644
--- a/cranelift/filetests/filetests/runtests/inline-probestack.clif
+++ b/cranelift/filetests/filetests/runtests/inline-probestack.clif
@@ -6,9 +6,11 @@ set probestack_strategy=inline
 ; This is the default and is equivalent to a page size of 4096
 set probestack_size_log2=12
 target x86_64
+target aarch64
 
 ; Test also with 64k pages
 set probestack_size_log2=16
 target x86_64
+target aarch64
 
 ; Create a huge stack slot (1MB), way larger than PAGE_SIZE and touch the end of it.
 ; This guarantees that we bypass the guard page, causing a page fault the OS isn't expecting
diff --git a/cranelift/fuzzgen/src/lib.rs b/cranelift/fuzzgen/src/lib.rs
index 04b27cc3d259..d7df6df082a0 100644
--- a/cranelift/fuzzgen/src/lib.rs
+++ b/cranelift/fuzzgen/src/lib.rs
@@ -266,10 +266,9 @@ where
             builder.set(flag_name, value.as_str())?;
         }
 
-        // Optionally test inline stackprobes on x86
-        // TODO: inline stack probes are not available on AArch64
+        // Optionally test inline stack probes on supported platforms
         // TODO: Test outlined stack probes.
-        if cfg!(target_arch = "x86_64") && bool::arbitrary(self.u)? {
+        if supports_inline_probestack() && bool::arbitrary(self.u)? {
             builder.enable("enable_probestack")?;
             builder.set("probestack_strategy", "inline")?;
 
@@ -300,7 +299,11 @@ where
         // into compilation anywhere, we leave it on unconditionally to make sure the generation doesn't panic.
         builder.enable("machine_code_cfg_info")?;
 
-        Ok(Flags::new(builder))
+        return Ok(Flags::new(builder));
+
+        fn supports_inline_probestack() -> bool {
+            cfg!(target_arch = "x86_64") || cfg!(target_arch = "aarch64")
+        }
     }
 
     pub fn generate_test(mut self) -> Result<TestCase> {
diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs
index ed9c4f3dd002..e8321baf7049 100644
--- a/crates/wasmtime/src/config.rs
+++ b/crates/wasmtime/src/config.rs
@@ -7,6 +7,7 @@ use std::fmt;
 #[cfg(feature = "cache")]
 use std::path::Path;
 use std::sync::Arc;
+use target_lexicon::Architecture;
 use wasmparser::WasmFeatures;
 #[cfg(feature = "cache")]
 use wasmtime_cache::CacheConfig;
@@ -1470,8 +1471,6 @@ impl Config {
 
     #[cfg(compiler)]
     pub(crate) fn build_compiler(&mut self) -> Result<Box<dyn wasmtime_environ::CompilerBuilder>> {
-        use target_lexicon::Architecture;
-
         let mut compiler = match self.compiler_config.strategy {
             Strategy::Auto | Strategy::Cranelift => wasmtime_cranelift::builder(),
         };
@@ -1480,35 +1479,30 @@ impl Config {
             compiler.target(target.clone())?;
         }
 
-        // On x86-64 targets, we enable stack probing by default.
+        // If probestack is enabled for a target, Wasmtime will always use the
+        // inline strategy, which doesn't require us to define a `__probestack`
+        // function or similar.
+        self.compiler_config
+            .settings
+            .insert("probestack_strategy".into(), "inline".into());
+
+        let host = target_lexicon::Triple::host();
+        let target = self.compiler_config.target.as_ref().unwrap_or(&host);
+
+        // On supported targets, we enable stack probing by default.
         // This is required on Windows because of the way Windows
         // commits its stacks, but it's also a good idea on other
         // platforms to ensure guard pages are hit for large frame
         // sizes.
-        if self
-            .compiler_config
-            .target
-            .as_ref()
-            .map(|t| t.architecture == Architecture::X86_64)
-            .unwrap_or(cfg!(target_arch = "x86_64"))
-        {
+        if probestack_supported(target.architecture) {
             self.compiler_config
                 .flags
                 .insert("enable_probestack".into());
-            self.compiler_config
-                .settings
-                .insert("probestack_strategy".into(), "inline".into());
         }
 
         if self.native_unwind_info ||
-            // Windows always needs unwind info, since it is part of the ABI.
-            self
-                .compiler_config
-                .target
-                .as_ref()
-                .map_or(cfg!(target_os = "windows"), |target| {
-                    target.operating_system == target_lexicon::OperatingSystem::Windows
-                })
+            // Windows always needs unwind info, since it is part of the ABI.
+            target.operating_system == target_lexicon::OperatingSystem::Windows
         {
             if !self
                 .compiler_config
@@ -1905,3 +1899,10 @@ impl PoolingAllocationConfig {
         self
     }
 }
+
+pub(crate) fn probestack_supported(arch: Architecture) -> bool {
+    matches!(
+        arch,
+        Architecture::X86_64 | Architecture::Aarch64(_) | Architecture::Riscv64(_)
+    )
+}
diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs
index 24961693bc6f..dfb386bf5708 100644
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -359,8 +359,6 @@ impl Engine {
         flag: &str,
         value: &FlagValue,
     ) -> Result<(), String> {
-        use target_lexicon::Architecture;
-
         let target = self.target();
         let ok = match flag {
             // These settings must all be enabled, since their value
@@ -369,11 +367,8 @@ impl Engine {
             "avoid_div_traps" => *value == FlagValue::Bool(true),
             "libcall_call_conv" => *value == FlagValue::Enum("isa_default".into()),
             "preserve_frame_pointers" => *value == FlagValue::Bool(true),
-
-            // On x86-64 targets, stack probing is enabled by default.
- "enable_probestack" => *value == FlagValue::Bool(target.architecture == Architecture::X86_64), - // On x86-64 targets, the default stack probing strategy is inline. - "probestack_strategy" => *value == FlagValue::Enum(if target.architecture == Architecture::X86_64 { "inline" } else { "outline" }.into()), + "enable_probestack" => *value == FlagValue::Bool(crate::config::probestack_supported(target.architecture)), + "probestack_strategy" => *value == FlagValue::Enum("inline".into()), // Features wasmtime doesn't use should all be disabled, since // otherwise if they are enabled it could change the behavior of diff --git a/tests/host_segfault.rs b/tests/host_segfault.rs index 6adac707e5d6..404fb4947028 100644 --- a/tests/host_segfault.rs +++ b/tests/host_segfault.rs @@ -144,6 +144,16 @@ fn main() { }, true, ), + ( + "overrun 8k with misconfigured host", + || overrun_with_big_module(8 << 10), + true, + ), + ( + "overrun 32k with misconfigured host", + || overrun_with_big_module(32 << 10), + true, + ), #[cfg(not(any(target_arch = "riscv64")))] // Due to `InstanceAllocationStrategy::pooling()` trying to alloc more than 6000G memory space. // https://gitlab.com/qemu-project/qemu/-/issues/1214 @@ -178,6 +188,7 @@ fn main() { } Err(_) => { for (name, _test, stack_overflow) in tests { + println!("running {name}"); run_test(name, *stack_overflow); } } @@ -245,7 +256,7 @@ fn is_stack_overflow(status: &ExitStatus, stderr: &str) -> bool { use std::os::unix::prelude::*; // The main thread might overflow or it might be from a fiber stack (SIGSEGV/SIGBUS) - stderr.contains("thread 'main' has overflowed its stack") + stderr.contains("has overflowed its stack") || match status.signal() { Some(libc::SIGSEGV) | Some(libc::SIGBUS) => true, _ => false, @@ -267,3 +278,47 @@ fn is_stack_overflow(status: &ExitStatus, _stderr: &str) -> bool { _ => false, } } + +fn overrun_with_big_module(approx_stack: usize) { + // Each call to `$get` produces ten 8-byte values which need to be saved + // onto the stack, so divide `approx_stack` by 80 to get + // a rough number of calls to consume `approx_stack` stack. + let n = approx_stack / 10 / 8; + + let mut s = String::new(); + s.push_str("(module\n"); + s.push_str("(func $big_stack\n"); + for _ in 0..n { + s.push_str("call $get\n"); + } + for _ in 0..n { + s.push_str("call $take\n"); + } + s.push_str(")\n"); + s.push_str("(func $get (result i64 i64 i64 i64 i64 i64 i64 i64 i64 i64) call $big_stack unreachable)\n"); + s.push_str("(func $take (param i64 i64 i64 i64 i64 i64 i64 i64 i64 i64) unreachable)\n"); + s.push_str("(func (export \"\") call $big_stack)\n"); + s.push_str(")\n"); + + // Give 100MB of stack to wasm, representing a misconfigured host. Run the + // actual module on a 2MB stack in a child thread to guarantee that the + // module here will overrun the stack. This should deterministically hit the + // guard page. + let mut config = Config::default(); + config.max_wasm_stack(100 << 20).async_stack_size(100 << 20); + let engine = Engine::new(&config).unwrap(); + let module = Module::new(&engine, &s).unwrap(); + let mut store = Store::new(&engine, ()); + let i = Instance::new(&mut store, &module, &[]).unwrap(); + let f = i.get_typed_func::<(), ()>(&mut store, "").unwrap(); + std::thread::Builder::new() + .stack_size(2 << 20) + .spawn(move || { + println!("{CONFIRM}"); + f.call(&mut store, ()).unwrap(); + }) + .unwrap() + .join() + .unwrap(); + unreachable!(); +}