diff --git a/cranelift/codegen/meta/src/shared/settings.rs b/cranelift/codegen/meta/src/shared/settings.rs
index d006d38815c2..b9fbef55c9e2 100644
--- a/cranelift/codegen/meta/src/shared/settings.rs
+++ b/cranelift/codegen/meta/src/shared/settings.rs
@@ -273,7 +273,7 @@ pub(crate) fn define() -> SettingGroup {
         "enable_probestack",
         "Enable the use of stack probes for supported calling conventions.",
         "",
-        true,
+        false,
     );
 
     settings.add_bool(
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index be6d2d978b04..d57a804ef7d7 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -631,14 +631,56 @@ impl ABIMachineSpec for AArch64MachineDeps {
         insts
     }
 
-    fn gen_probestack(_: u32) -> SmallInstVec<Inst> {
+    fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
         // TODO: implement if we ever require stack probes on an AArch64 host
         // (unlikely unless Lucet is ported)
-        smallvec![]
+        unimplemented!("Stack probing is unimplemented on AArch64");
     }
 
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Inst> {
-        unimplemented!("Inline stack probing is unimplemented on AArch64");
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
+        // The stack probe loop currently takes 6 instructions and each
+        // inline probe takes 2 (roughly; the exact counts depend on the
+        // constants). Set this to 3 to keep the max size of the probe to 6
+        // instructions.
+        const PROBE_MAX_UNROLL: u32 = 3;
+
+        let probe_count = align_to(frame_size, guard_size) / guard_size;
+        if probe_count <= PROBE_MAX_UNROLL {
+            // When manually unrolling, emit an instruction that stores 0 at
+            // a constant offset relative to the stack pointer. This turns
+            // into something like `movn tmp, #n ; str wzr, [sp, tmp, SXTX]`.
+            //
+            // Note that the last probe may store beyond the actual frame
+            // size, but that's fine: it lands in unused stack space, and if
+            // that store faults we were about to fault legitimately right
+            // next to it anyway.
+            insts.reserve(probe_count as usize);
+            for i in 0..probe_count {
+                let offset = (guard_size * (i + 1)) as i64;
+                insts.push(Self::gen_store_stack(
+                    StackAMode::SPOffset(-offset, I8),
+                    zero_reg(),
+                    I32,
+                ));
+            }
+        } else {
+            // The non-unrolled version uses two temporary registers: `start`
+            // holds the current offset from sp and counts downwards in steps
+            // of `guard_size`, while `end` holds the size of the frame,
+            // which is where we stop.
+            //
+            // Note that this is emitted post-regalloc, so it's safe to use
+            // the temporary registers as the loop's input/output; the loop
+            // itself is not allowed to use them.
+            let start = writable_spilltmp_reg();
+            let end = writable_tmp2_reg();
+            insts.extend(Inst::load_constant(start, 0));
+            insts.extend(Inst::load_constant(end, frame_size.into()));
+            insts.push(Inst::StackProbeLoop {
+                start,
+                end: end.to_reg(),
+                step: Imm12::maybe_from_u64(guard_size.into()).unwrap(),
+            });
+        }
     }
 
     // Returns stack bytes used as well as instructions. Does not adjust
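The unroll-vs-loop decision above reappears in the same shape in the riscv64 and x64 backends later in this patch. A minimal standalone sketch of the heuristic, with the aarch64 constant (the helper below is illustrative, not the real cranelift `align_to`):

// Sketch: how many probes a frame needs, and which strategy that picks.
fn probe_count(frame_size: u32, guard_size: u32) -> u32 {
    // Equivalent to align_to(frame_size, guard_size) / guard_size.
    (frame_size + guard_size - 1) / guard_size
}

fn main() {
    const PROBE_MAX_UNROLL: u32 = 3; // the aarch64 value above
    // 12 KiB frame, 4 KiB guard pages: 3 probes, fully unrolled.
    assert!(probe_count(12288, 4096) <= PROBE_MAX_UNROLL);
    // ~100 KB frame: 25 probes, so a StackProbeLoop is emitted instead.
    assert_eq!(probe_count(100_000, 4096), 25);
}

The `%unrolled` and `%large` functions in the new aarch64 filetest later in this patch exercise exactly these two paths.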
diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 46e8e842d421..b61ca529c997 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -935,7 +935,16 @@
     ;; A dummy use, useful to keep a value alive.
     (DummyUse
-      (reg Reg))))
+      (reg Reg))
+
+    ;; Emits an inline stack probe loop.
+    ;;
+    ;; Note that this is emitted post-regalloc so `start` and `end` can be
+    ;; temporary registers such as the spilltmp and tmp2 registers. This also
+    ;; means that the internal codegen can't use these registers.
+    (StackProbeLoop (start WritableReg)
+                    (end Reg)
+                    (step Imm12))))
 
 ;; An ALU operation. This can be paired with several instruction formats
 ;; below (see `Inst`) in any combination.
@@ -1134,8 +1143,7 @@
   ;; [crate::isa::aarch64::abi](the ABI module) for more details.
   (NominalSPOffset (off i64)
-                   (ty Type))
-))
+                   (ty Type))))
 
 (type PairAMode extern (enum))
 (type FPUOpRI extern (enum))
@@ -2242,7 +2250,7 @@
   (let ((dst WritableReg (temp_writable_reg $I64)))
     (ConsumesFlags.ConsumesFlagsReturnsResultWithProducer (MInst.CSet dst cond) dst)))
 
-;; Helper for constructing `csetm` instructions. 
+;; Helper for constructing `csetm` instructions.
 (decl csetm (Cond) ConsumesFlags)
 (rule (csetm cond)
   (let ((dst WritableReg (temp_writable_reg $I64)))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d2277b4ba6dd..0280a4c00f8b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -63,21 +63,14 @@ pub fn mem_finalize(
         (smallvec![], mem)
     } else {
         let tmp = writable_spilltmp_reg();
-        let mut const_insts = Inst::load_constant(tmp, off as u64);
-        // N.B.: we must use AluRRRExtend because AluRRR uses the "shifted register" form
-        // (AluRRRShift) instead, which interprets register 31 as the zero reg, not SP. SP
-        // is a valid base (for SPOffset) which we must handle here.
-        // Also, SP needs to be the first arg, not second.
-        let add_inst = Inst::AluRRRExtend {
-            alu_op: ALUOp::Add,
-            size: OperandSize::Size64,
-            rd: tmp,
-            rn: basereg,
-            rm: tmp.to_reg(),
-            extendop: ExtendOp::UXTX,
-        };
-        const_insts.push(add_inst);
-        (const_insts, AMode::reg(tmp.to_reg()))
+        (
+            Inst::load_constant(tmp, off as u64),
+            AMode::RegExtended {
+                rn: basereg,
+                rm: tmp.to_reg(),
+                extendop: ExtendOp::SXTX,
+            },
+        )
     }
 }
 
@@ -3424,6 +3417,72 @@ impl MachInstEmit for Inst {
             }
 
             &Inst::DummyUse { .. } => {}
+
+            &Inst::StackProbeLoop { start, end, step } => {
+                assert!(emit_info.0.enable_probestack());
+                let start = allocs.next_writable(start);
+                let end = allocs.next(end);
+
+                // The loop generated here uses `start` as a counter register
+                // to count backwards until negating it exceeds `end`. In
+                // other words `start` is an offset from `sp` we're testing
+                // where `end` is the max size we need to test. The loop
+                // looks like:
+                //
+                // loop_start:
+                //     sub start, start, #step
+                //     str wzr, [sp, start]
+                //     cmn start, end
+                //     b.gt loop_start
+                // loop_end:
+                //
+                // Note that this loop cannot use the spilltmp and tmp2
+                // registers as those are currently used as the input to this
+                // loop when generating the instruction. This means that some
+                // more flavorful address modes and lowerings need to be
+                // avoided.
+                //
+                // Perhaps someone more clever than I can figure out how to
+                // use `subs` or the like and skip the `cmn`, but I can't
+                // figure it out at this time.
+
+                let loop_start = sink.get_label();
+                sink.bind_label(loop_start);
+
+                Inst::AluRRImm12 {
+                    alu_op: ALUOp::Sub,
+                    size: OperandSize::Size64,
+                    rd: start,
+                    rn: start.to_reg(),
+                    imm12: step,
+                }
+                .emit(&[], sink, emit_info, state);
+                Inst::Store32 {
+                    rd: regs::zero_reg(),
+                    mem: AMode::RegReg {
+                        rn: regs::stack_reg(),
+                        rm: start.to_reg(),
+                    },
+                    flags: MemFlags::trusted(),
+                }
+                .emit(&[], sink, emit_info, state);
+                Inst::AluRRR {
+                    alu_op: ALUOp::AddS,
+                    size: OperandSize::Size64,
+                    rd: regs::writable_zero_reg(),
+                    rn: start.to_reg(),
+                    rm: end,
+                }
+                .emit(&[], sink, emit_info, state);
+
+                let loop_end = sink.get_label();
+                Inst::CondBr {
+                    taken: BranchTarget::Label(loop_start),
+                    not_taken: BranchTarget::Label(loop_end),
+                    kind: CondBrKind::Cond(Cond::Gt),
+                }
+                .emit(&[], sink, emit_info, state);
+                sink.bind_label(loop_end);
+            }
         }
 
         let end_off = sink.cur_offset();
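Tracing the counter makes the `cmn`/`b.gt` exit condition concrete: `start` runs 0, -step, -2*step, and so on, each iteration stores to `[sp + start]`, and `cmn start, end` sets flags on `start + end`, so the backwards branch is taken while `end + start > 0`. A standalone sketch of the resulting iteration count (illustrative only; it mirrors the four instructions above):

// Counts the stores the emitted probe loop performs for a given frame/step.
fn loop_probes(frame_size: i64, step: i64) -> i64 {
    let mut start = 0i64;
    let mut probes = 0i64;
    loop {
        start -= step; // sub start, start, #step
        probes += 1; // str wzr, [sp, start]
        if start + frame_size <= 0 {
            // cmn start, end; b.gt loops only while start + end > 0
            break;
        }
    }
    probes
}

fn main() {
    // The %large case in the new filetest: a 100_000-byte frame with a
    // 4 KiB step probes 25 times, touching down to [sp - 102400].
    assert_eq!(loop_probes(100_000, 4096), 25);
}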
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 8e06737f4617..b5e926b01197 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1789,8 +1789,8 @@ fn test_aarch64_binemit() {
             mem: AMode::FPOffset { off: 32768, ty: I8 },
             flags: MemFlags::trusted(),
         },
-        "100090D2B063308B010240F9",
-        "movz x16, #32768 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "100090D2A1EB70F8",
+        "movz x16, #32768 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1801,8 +1801,8 @@ fn test_aarch64_binemit() {
             },
             flags: MemFlags::trusted(),
         },
-        "F0FF8F92B063308B010240F9",
-        "movn x16, #32767 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "F0FF8F92A1EB70F8",
+        "movn x16, #32767 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1813,8 +1813,8 @@ fn test_aarch64_binemit() {
             }, // 2^20
             flags: MemFlags::trusted(),
         },
-        "1002A0D2B063308B010240F9",
-        "movz x16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "1002A0D2A1EB70F8",
+        "movz x16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
     ));
     insns.push((
         Inst::ULoad64 {
@@ -1825,8 +1825,8 @@ fn test_aarch64_binemit() {
            }, // 2^20 + 1
            flags: MemFlags::trusted(),
        },
-        "300080521002A072B063308B010240F9",
-        "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; add x16, fp, x16, UXTX ; ldr x1, [x16]",
+        "300080521002A072A1EB70F8",
+        "movz w16, #1 ; movk w16, w16, #16, LSL #16 ; ldr x1, [fp, x16, SXTX]",
     ));
 
     insns.push((
@@ -1867,8 +1867,8 @@ fn test_aarch64_binemit() {
             },
             flags: MemFlags::trusted(),
         },
-        "1002A0D2F060308B010240F9",
-        "movz x16, #16, LSL #16 ; add x16, x7, x16, UXTX ; ldr x1, [x16]",
+        "1002A0D2E1E870F8",
+        "movz x16, #16, LSL #16 ; ldr x1, [x7, x16, SXTX]",
     ));
 
     insns.push((
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 2acef7e191e3..3ed883a49455 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -1103,6 +1103,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) {
         &Inst::DummyUse { reg } => {
             collector.reg_use(reg);
         }
+        &Inst::StackProbeLoop { start, end, .. } => {
+            collector.reg_early_def(start);
+            collector.reg_use(end);
+        }
     }
 }
 
@@ -2887,6 +2891,12 @@ impl Inst {
                 let reg = pretty_print_reg(reg, allocs);
                 format!("dummy_use {}", reg)
             }
+            &Inst::StackProbeLoop { start, end, step } => {
+                let start = pretty_print_reg(start.to_reg(), allocs);
+                let end = pretty_print_reg(end, allocs);
+                let step = step.pretty_print(0, allocs);
+                format!("stack_probe_loop {start}, {end}, {step}")
+            }
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs
index 7aeee8ba57d7..995a52fbc09d 100644
--- a/cranelift/codegen/src/isa/riscv64/abi.rs
+++ b/cranelift/codegen/src/isa/riscv64/abi.rs
@@ -360,8 +360,7 @@ impl ABIMachineSpec for Riscv64MachineDeps {
         insts
     }
 
-    fn gen_probestack(frame_size: u32) -> SmallInstVec<Inst> {
-        let mut insts = SmallVec::new();
+    fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32) {
         insts.extend(Inst::load_constant_u32(writable_a0(), frame_size as u64));
         insts.push(Inst::Call {
             info: Box::new(CallInfo {
@@ -377,7 +376,6 @@ impl ABIMachineSpec for Riscv64MachineDeps {
                 caller_callconv: CallConv::SystemV,
             }),
         });
-        insts
     }
 
     // Returns stack bytes used as well as instructions. Does not adjust
     // nominal SP offset; abi_impl generic code will do that.
@@ -632,16 +630,16 @@ impl ABIMachineSpec for Riscv64MachineDeps {
             || fixed_frame_storage_size > 0
     }
 
-    fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Inst> {
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
         // Unroll at most n consecutive probes before falling back to using a loop
         const PROBE_MAX_UNROLL: u32 = 3;
 
         // Number of probes that we need to perform
         let probe_count = align_to(frame_size, guard_size) / guard_size;
 
         if probe_count <= PROBE_MAX_UNROLL {
-            Self::gen_probestack_unroll(guard_size, probe_count)
+            Self::gen_probestack_unroll(insts, guard_size, probe_count)
         } else {
-            Self::gen_probestack_loop(guard_size, probe_count)
+            Self::gen_probestack_loop(insts, guard_size, probe_count)
         }
     }
 }
@@ -697,8 +695,8 @@ fn compute_clobber_size(clobbers: &[Writable<RealReg>]) -> u32 {
 }
 
 impl Riscv64MachineDeps {
-    fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
-        let mut insts = SmallVec::with_capacity(probe_count as usize);
+    fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
+        insts.reserve(probe_count as usize);
         for i in 0..probe_count {
             let offset = (guard_size * (i + 1)) as i64;
             insts.push(Self::gen_store_stack(
@@ -707,13 +705,13 @@ impl Riscv64MachineDeps {
                 I32,
             ));
         }
-        insts
     }
-    fn gen_probestack_loop(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
-        smallvec![Inst::StackProbeLoop {
+
+    fn gen_probestack_loop(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
+        insts.push(Inst::StackProbeLoop {
             guard_size,
             probe_count,
             tmp: Writable::from_reg(x_reg(28)), // t3
-        }]
+        });
     }
 }
diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs
index 45e051f0216a..f123a4896697 100644
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -569,13 +569,17 @@ impl ABIMachineSpec for S390xMachineDeps {
         SmallVec::new()
     }
 
-    fn gen_probestack(_: u32) -> SmallInstVec<Inst> {
+    fn gen_probestack(_insts: &mut SmallInstVec<Self::I>, _: u32) {
         // TODO: implement if we ever require stack probes on an s390x host
         // (unlikely unless Lucet is ported)
-        smallvec![]
+        unimplemented!("Stack probing is unimplemented on S390x");
     }
 
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Inst> {
+    fn gen_inline_probestack(
+        _insts: &mut SmallInstVec<Self::I>,
+        _frame_size: u32,
+        _guard_size: u32,
+    ) {
         unimplemented!("Inline stack probing is unimplemented on S390x");
     }
 
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index 91bde772c6a0..d2b9d6e18ee0 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -30,8 +30,8 @@ pub(crate) type X64Caller = Caller<X64ABIMachineSpec>;
 pub struct X64ABIMachineSpec;
 
 impl X64ABIMachineSpec {
-    fn gen_probestack_unroll(guard_size: u32, probe_count: u32) -> SmallInstVec<Inst> {
-        let mut insts = SmallVec::with_capacity(probe_count as usize);
+    fn gen_probestack_unroll(insts: &mut SmallInstVec<Inst>, guard_size: u32, probe_count: u32) {
+        insts.reserve(probe_count as usize);
         for i in 0..probe_count {
             let offset = (guard_size * (i + 1)) as i64;
 
@@ -43,9 +43,8 @@ impl X64ABIMachineSpec {
                 I32,
             ));
         }
-        insts
     }
-    fn gen_probestack_loop(frame_size: u32, guard_size: u32) -> SmallInstVec<Inst> {
+
+    fn gen_probestack_loop(insts: &mut SmallInstVec<Inst>, frame_size: u32, guard_size: u32) {
         // We have to use a caller saved register since clobbering only happens
         // after stack probing.
         //
@@ -57,11 +56,11 @@ impl X64ABIMachineSpec {
             !is_callee_save_systemv(real_reg, false) && !is_callee_save_fastcall(real_reg, false)
         });
 
-        smallvec![Inst::StackProbeLoop {
+        insts.push(Inst::StackProbeLoop {
             tmp: Writable::from_reg(tmp),
             frame_size,
             guard_size,
-        }]
+        });
     }
 }
 
@@ -420,8 +419,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         insts
     }
 
-    fn gen_probestack(frame_size: u32) -> SmallInstVec<Self::I> {
-        let mut insts = SmallVec::new();
+    fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32) {
         insts.push(Inst::imm(
             OperandSize::Size32,
             frame_size as u64,
@@ -438,10 +436,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
                 opcode: Opcode::Call,
             }),
         });
-        insts
     }
 
-    fn gen_inline_probestack(frame_size: u32, guard_size: u32) -> SmallInstVec<Self::I> {
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32) {
         // Unroll at most n consecutive probes before falling back to using a loop
         //
         // This number was picked because the loop version is 38 bytes long. We can fit
@@ -452,9 +449,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         let probe_count = align_to(frame_size, guard_size) / guard_size;
 
         if probe_count <= PROBE_MAX_UNROLL {
-            Self::gen_probestack_unroll(guard_size, probe_count)
+            Self::gen_probestack_unroll(insts, guard_size, probe_count)
         } else {
-            Self::gen_probestack_loop(frame_size, guard_size)
+            Self::gen_probestack_loop(insts, frame_size, guard_size)
         }
     }
 
diff --git a/cranelift/codegen/src/isa/x64/encoding/rex.rs b/cranelift/codegen/src/isa/x64/encoding/rex.rs
index 4ca681a1bcf8..41ae596eba94 100644
--- a/cranelift/codegen/src/isa/x64/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/rex.rs
@@ -13,7 +13,7 @@ use crate::{
     ir::TrapCode,
     isa::x64::inst::{
         args::{Amode, OperandSize},
-        regs, EmitInfo, Inst, LabelUse,
+        regs, Inst, LabelUse,
     },
     machinst::MachBuffer,
 };
@@ -293,7 +293,6 @@ impl Default for LegacyPrefixes {
 /// indicate a 64-bit operation.
 pub(crate) fn emit_std_enc_mem(
     sink: &mut MachBuffer<Inst>,
-    info: &EmitInfo,
     prefixes: LegacyPrefixes,
     opcodes: u32,
     mut num_opcodes: usize,
@@ -315,12 +314,6 @@ pub(crate) fn emit_std_enc_mem(
 
     match *mem_e {
         Amode::ImmReg { simm32, base, .. } => {
-            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
-            // first touch of a new stack page.
-            if base == regs::rsp() && !can_trap && info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
-
             // First, the REX byte.
             let enc_e = int_reg_enc(base);
             rex.emit_two_op(sink, enc_g, enc_e);
@@ -381,12 +374,6 @@ pub(crate) fn emit_std_enc_mem(
             shift,
             ..
         } => {
-            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
-            // first touch of a new stack page.
-            if *reg_base == regs::rsp() && !can_trap && info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
-
             let enc_base = int_reg_enc(*reg_base);
             let enc_index = int_reg_enc(*reg_index);
 
@@ -481,7 +468,6 @@ pub(crate) fn emit_std_enc_enc(
 
 pub(crate) fn emit_std_reg_mem(
     sink: &mut MachBuffer<Inst>,
-    info: &EmitInfo,
     prefixes: LegacyPrefixes,
     opcodes: u32,
     num_opcodes: usize,
@@ -493,7 +479,6 @@ pub(crate) fn emit_std_reg_mem(
     let enc_g = reg_enc(reg_g);
     emit_std_enc_mem(
         sink,
-        info,
         prefixes,
         opcodes,
         num_opcodes,
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 6b468d2bc6f9..73e25ca47031 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -171,7 +171,6 @@ pub(crate) fn emit(
             let amode = addr.finalize(state, sink);
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 0x0FAF,
                 2,
@@ -223,7 +222,6 @@ pub(crate) fn emit(
             // Here we revert to the "normal" G-E ordering.
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcode_m,
                 1,
@@ -275,7 +273,6 @@ pub(crate) fn emit(
             let enc_g = int_reg_enc(src2);
             emit_std_enc_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcode,
                 1,
@@ -317,17 +314,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr: src } => {
                 let amode = src.finalize(state, sink).with_allocs(allocs);
-                emit_std_reg_mem(
-                    sink,
-                    info,
-                    prefix,
-                    opcode,
-                    num_opcodes,
-                    dst,
-                    &amode,
-                    rex_flags,
-                    0,
-                );
+                emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst, &amode, rex_flags, 0);
             }
         }
     }
@@ -414,7 +401,6 @@ pub(crate) fn emit(
             let amode = src.finalize(state, sink).with_allocs(allocs);
             emit_std_enc_mem(
                 sink,
-                info,
                 prefix,
                 opcode,
                 1,
@@ -459,7 +445,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr: src } => {
                 let amode = src.finalize(state, sink).with_allocs(allocs);
-                emit_std_enc_mem(sink, info, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
+                emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
             }
         }
     }
@@ -779,7 +765,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcodes,
                 num_opcodes,
@@ -798,7 +783,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 0x8B,
                 1,
@@ -815,7 +799,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 0x8D,
                 1,
@@ -877,7 +860,6 @@ pub(crate) fn emit(
 
             emit_std_reg_mem(
                 sink,
-                info,
                 LegacyPrefixes::None,
                 opcodes,
                 num_opcodes,
@@ -913,7 +895,7 @@ pub(crate) fn emit(
             // 16-bit: MOV r16, r/m16 is 66 (REX.W==0) 89 /r
             // 32-bit: MOV r32, r/m32 is (REX.W==0) 89 /r
             // 64-bit: MOV r64, r/m64 is (REX.W==1) 89 /r
-            emit_std_reg_mem(sink, info, prefix, opcode, 1, src, dst, rex, 0);
+            emit_std_reg_mem(sink, prefix, opcode, 1, src, dst, rex, 0);
         }
 
         Inst::ShiftR {
@@ -1023,7 +1005,7 @@ pub(crate) fn emit(
             }
             RegMemImm::Mem { addr } => {
                 let addr = &addr.finalize(state, sink).with_allocs(allocs);
-                emit_std_reg_mem(sink, info, prefix, opcode_bytes, 2, dst, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode_bytes, 2, dst, addr, rex, 0);
             }
             RegMemImm::Imm { .. } => unreachable!(),
         }
@@ -1078,7 +1060,7 @@ pub(crate) fn emit(
                     (OperandSize::Size8, false) => 0x84,
                     (_, false) => 0x85,
                 };
-                emit_std_reg_mem(sink, info, prefix, opcode, 1, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, 1, reg_g, addr, rex, 0);
             }
 
             RegMemImm::Imm { simm32 } => {
@@ -1167,7 +1149,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink).with_allocs(allocs);
-                emit_std_reg_mem(sink, info, prefix, opcode, 2, dst, addr, rex_flags, 0);
+                emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex_flags, 0);
             }
         }
     }
@@ -1210,10 +1192,6 @@ pub(crate) fn emit(
         Inst::Push64 { src } => {
             let src = src.clone().to_reg_mem_imm().with_allocs(allocs);
 
-            if info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
-
             match src {
                 RegMemImm::Reg { reg } => {
                     let enc_reg = int_reg_enc(reg);
@@ -1228,7 +1206,6 @@ pub(crate) fn emit(
                     let addr = &addr.finalize(state, sink);
                     emit_std_enc_mem(
                         sink,
-                        info,
                         LegacyPrefixes::None,
                         0xFF,
                         1,
@@ -1369,9 +1346,6 @@ pub(crate) fn emit(
             info: call_info,
             ..
         } => {
-            if info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
             if let Some(s) = state.take_stack_map() {
                 sink.add_stack_map(StackMapExtent::UpcomingBytes(5), s);
             }
@@ -1392,9 +1366,6 @@ pub(crate) fn emit(
         } => {
             let dest = dest.with_allocs(allocs);
 
-            if info.flags.enable_probestack() {
-                sink.add_trap(TrapCode::StackOverflow);
-            }
             let start_offset = sink.cur_offset();
             match dest {
                 RegMem::Reg { reg } => {
@@ -1414,7 +1385,6 @@ pub(crate) fn emit(
                     let addr = &addr.finalize(state, sink);
                     emit_std_enc_mem(
                         sink,
-                        info,
                         LegacyPrefixes::None,
                         0xFF,
                         1,
@@ -1516,7 +1486,6 @@ pub(crate) fn emit(
                     let addr = &addr.finalize(state, sink);
                     emit_std_enc_mem(
                         sink,
-                        info,
                         LegacyPrefixes::None,
                         0xFF,
                         1,
@@ -1739,7 +1708,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
             }
         };
     }
@@ -1765,7 +1734,7 @@ pub(crate) fn emit(
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
                 // N.B.: bytes_at_end == 1, because of the `imm` byte below.
-                emit_std_reg_mem(sink, info, prefix, opcode, len, dst, addr, rex, 1);
+                emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
             }
         }
         sink.put1(*imm);
@@ -1918,7 +1887,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, length, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
             }
         }
     }
@@ -1951,7 +1920,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, length, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
             }
         }
     }
@@ -2191,7 +2160,7 @@ pub(crate) fn emit(
                     "No existing way to encode a mem argument in the ModRM r/m field."
                 );
                 // N.B.: bytes_at_end == 1, because of the `imm` byte below.
-                emit_std_reg_mem(sink, info, prefix, opcode, len, dst, addr, rex, 1);
+                emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
             }
         }
         sink.put1(*imm);
@@ -2217,17 +2186,7 @@ pub(crate) fn emit(
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let dst = &dst.finalize(state, sink);
-            emit_std_reg_mem(
-                sink,
-                info,
-                prefix,
-                opcode,
-                2,
-                src,
-                dst,
-                RexFlags::clear_w(),
-                0,
-            );
+            emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
         }
 
         Inst::XmmToGpr {
@@ -2280,7 +2239,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, 2, reg_g, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, 2, reg_g, addr, rex, 0);
             }
         }
     }
@@ -2303,7 +2262,7 @@ pub(crate) fn emit(
             }
             RegMem::Mem { addr } => {
                 let addr = &addr.finalize(state, sink);
-                emit_std_reg_mem(sink, info, prefix, opcode, len, dst, addr, rex, 0);
+                emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 0);
             }
         }
     }
@@ -2912,7 +2871,7 @@ pub(crate) fn emit(
             };
             let rex = RexFlags::from((OperandSize::from_ty(*ty), replacement));
             let amode = mem.finalize(state, sink);
-            emit_std_reg_mem(sink, info, prefix, opcodes, 2, replacement, &amode, rex, 0);
+            emit_std_reg_mem(sink, prefix, opcodes, 2, replacement, &amode, rex, 0);
         }
 
         Inst::AtomicRmwSeq {
diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
index 450a76ffabe9..b82ce632e189 100644
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -500,10 +500,10 @@
     fn gen_epilogue_frame_restore(flags: &settings::Flags) -> SmallInstVec<Self::I>;
 
     /// Generate a probestack call.
-    fn gen_probestack(_frame_size: u32) -> SmallInstVec<Self::I>;
+    fn gen_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32);
 
     /// Generate an inline stack probe.
-    fn gen_inline_probestack(_frame_size: u32, _guard_size: u32) -> SmallInstVec<Self::I>;
+    fn gen_inline_probestack(insts: &mut SmallInstVec<Self::I>, frame_size: u32, guard_size: u32);
 
     /// Get all clobbered registers that are callee-saved according to the ABI; the result
     /// contains the registers in a sorted order.
@@ -1838,14 +1838,13 @@ impl<M: ABIMachineSpec> Callee<M> {
             .map_or(false, |min_frame| total_stacksize >= min_frame);
 
         if needs_probestack {
-            insts.extend(
-                if self.flags.probestack_strategy() == ProbestackStrategy::Inline {
+            match self.flags.probestack_strategy() {
+                ProbestackStrategy::Inline => {
                     let guard_size = 1 << self.flags.probestack_size_log2();
-                    M::gen_inline_probestack(total_stacksize, guard_size)
-                } else {
-                    M::gen_probestack(total_stacksize)
-                },
-            );
+                    M::gen_inline_probestack(&mut insts, total_stacksize, guard_size)
+                }
+                ProbestackStrategy::Outline => M::gen_probestack(&mut insts, total_stacksize),
+            }
         }
     }
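The trait change above turns both hooks from "return a fresh vector" into "append to the caller's buffer". A simplified before/after sketch of that calling convention (types shrunk to `Vec<String>` for illustration; these are not the real cranelift types):

// Old shape: each backend allocated and returned its own vector, which
// the caller then spliced in with `insts.extend(...)`.
fn gen_probestack_old(frame_size: u32) -> Vec<String> {
    vec![format!("probe {frame_size}")]
}

// New shape: backends push straight into the prologue's existing buffer,
// dropping the intermediate allocation.
fn gen_probestack_new(insts: &mut Vec<String>, frame_size: u32) {
    insts.push(format!("probe {frame_size}"));
}

fn main() {
    let mut insts = vec!["prologue".to_string()];
    insts.extend(gen_probestack_old(4096));
    gen_probestack_new(&mut insts, 4096);
    assert_eq!(insts.len(), 3);
}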
diff --git a/cranelift/codegen/src/settings.rs b/cranelift/codegen/src/settings.rs
index 0a214dfbde2e..4418f990a178 100644
--- a/cranelift/codegen/src/settings.rs
+++ b/cranelift/codegen/src/settings.rs
@@ -545,7 +545,7 @@ enable_llvm_abi_extensions = false
 unwind_info = true
 preserve_frame_pointers = false
 machine_code_cfg_info = false
-enable_probestack = true
+enable_probestack = false
 probestack_func_adjusts_sp = false
 enable_jump_tables = true
 enable_heap_access_spectre_mitigation = true
diff --git a/cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif b/cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif
new file mode 100644
index 000000000000..38838174b9b9
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/inline-probestack.clif
@@ -0,0 +1,71 @@
+test compile precise-output
+set enable_probestack=true
+set probestack_strategy=inline
+; This is the default and is equivalent to a page size of 4096
+set probestack_size_log2=12
+target aarch64
+
+
+; If the stack size is just one page, we can avoid the stack probe entirely
+function %single_page() -> i64 system_v {
+ss0 = explicit_slot 2048
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; stp fp, lr, [sp, #-16]!
+; mov fp, sp
+; sub sp, sp, #2048
+; block0:
+; mov x0, sp
+; add sp, sp, #2048
+; ldp fp, lr, [sp], #16
+; ret
+
+function %unrolled() -> i64 system_v {
+ss0 = explicit_slot 12288
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; stp fp, lr, [sp, #-16]!
+; mov fp, sp
+; movn x16, #4095 ; str wzr, [sp, x16, SXTX]
+; movn x16, #8191 ; str wzr, [sp, x16, SXTX]
+; movn x16, #12287 ; str wzr, [sp, x16, SXTX]
+; sub sp, sp, #12288
+; block0:
+; mov x0, sp
+; add sp, sp, #12288
+; ldp fp, lr, [sp], #16
+; ret
+
+function %large() -> i64 system_v {
+ss0 = explicit_slot 100000
+
+block0:
+  v1 = stack_addr.i64 ss0
+  return v1
+}
+
+; stp fp, lr, [sp, #-16]!
+; mov fp, sp
+; movz x16, #0
+; movz w17, #34464
+; movk w17, w17, #1, LSL #16
+; stack_probe_loop x16, x17, #4096
+; movz w16, #34464
+; movk w16, w16, #1, LSL #16
+; sub sp, sp, x16, UXTX
+; block0:
+; mov x0, sp
+; movz w16, #34464
+; movk w16, w16, #1, LSL #16
+; add sp, sp, x16, UXTX
+; ldp fp, lr, [sp], #16
+; ret
+
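The `set` lines at the top of this filetest correspond one-to-one to flags an embedder would set programmatically. A hypothetical sketch against cranelift-codegen's settings API (the flag names are the real ones from this patch; the surrounding function and the anyhow error handling are illustrative assumptions):

use cranelift_codegen::settings::{self, Configurable};

fn inline_probestack_flags() -> anyhow::Result<settings::Flags> {
    let mut builder = settings::builder();
    builder.enable("enable_probestack")?; // now disabled by default
    builder.set("probestack_strategy", "inline")?; // no __probestack libcall
    builder.set("probestack_size_log2", "12")?; // assumes 4 KiB guard pages
    Ok(settings::Flags::new(builder))
}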
diff --git a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
index 703d8c4b35b0..e93dc41c429f 100644
--- a/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/stack-limit.clif
@@ -177,7 +177,7 @@ block0(v0: i64):
 
 ; stp fp, lr, [sp, #-16]!
 ; mov fp, sp
-; movz w16, #6784 ; movk w16, w16, #6, LSL #16 ; add x16, x0, x16, UXTX ; ldr x16, [x16]
+; movz w16, #6784 ; movk w16, w16, #6, LSL #16 ; ldr x16, [x0, x16, SXTX]
 ; add x16, x16, #32
 ; subs xzr, sp, x16, UXTX
 ; b.hs 8 ; udf
diff --git a/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif b/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif
index 46f612db4299..7706815fa519 100644
--- a/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/stack-limit.clif
@@ -1,5 +1,6 @@
 test compile precise-output
 set unwind_info=false
+set enable_probestack=true
 target riscv64
 
 function %foo() {
diff --git a/cranelift/filetests/filetests/isa/riscv64/stack.clif b/cranelift/filetests/filetests/isa/riscv64/stack.clif
index c5073cf78cac..d70250ceb409 100644
--- a/cranelift/filetests/filetests/isa/riscv64/stack.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/stack.clif
@@ -1,5 +1,6 @@
 test compile precise-output
 set unwind_info=false
+set enable_probestack=true
 target riscv64
 
 function %stack_addr_small() -> i64 {
diff --git a/cranelift/filetests/filetests/runtests/inline-probestack.clif b/cranelift/filetests/filetests/runtests/inline-probestack.clif
index 21426137d86b..ea5e4241cc4b 100644
--- a/cranelift/filetests/filetests/runtests/inline-probestack.clif
+++ b/cranelift/filetests/filetests/runtests/inline-probestack.clif
@@ -6,9 +6,11 @@ set probestack_strategy=inline
 ; This is the default and is equivalent to a page size of 4096
 set probestack_size_log2=12
 target x86_64
+target aarch64
 
 ; Test also with 64k pages
 set probestack_size_log2=16
 target x86_64
+target aarch64
 
 ; Create a huge stack slot (1MB), way larger than PAGE_SIZE and touch the end of it.
 ; This guarantees that we bypass the guard page, causing a page fault the OS isn't expecting
diff --git a/cranelift/fuzzgen/src/lib.rs b/cranelift/fuzzgen/src/lib.rs
index 04b27cc3d259..d7df6df082a0 100644
--- a/cranelift/fuzzgen/src/lib.rs
+++ b/cranelift/fuzzgen/src/lib.rs
@@ -266,10 +266,9 @@ where
             builder.set(flag_name, value.as_str())?;
         }
 
-        // Optionally test inline stackprobes on x86
-        // TODO: inline stack probes are not available on AArch64
+        // Optionally test inline stack probes on supported platforms
         // TODO: Test outlined stack probes.
-        if cfg!(target_arch = "x86_64") && bool::arbitrary(self.u)? {
+        if supports_inline_probestack() && bool::arbitrary(self.u)? {
             builder.enable("enable_probestack")?;
             builder.set("probestack_strategy", "inline")?;
 
@@ -300,7 +299,11 @@ where
         // into compilation anywhere, we leave it on unconditionally to make sure the generation doesn't panic.
         builder.enable("machine_code_cfg_info")?;
 
-        Ok(Flags::new(builder))
+        return Ok(Flags::new(builder));
+
+        fn supports_inline_probestack() -> bool {
+            cfg!(target_arch = "x86_64") || cfg!(target_arch = "aarch64")
+        }
     }
 
     pub fn generate_test(mut self) -> Result<TestCase> {
diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs
index ed9c4f3dd002..e8321baf7049 100644
--- a/crates/wasmtime/src/config.rs
+++ b/crates/wasmtime/src/config.rs
@@ -7,6 +7,7 @@ use std::fmt;
 #[cfg(feature = "cache")]
 use std::path::Path;
 use std::sync::Arc;
+use target_lexicon::Architecture;
 use wasmparser::WasmFeatures;
 #[cfg(feature = "cache")]
 use wasmtime_cache::CacheConfig;
@@ -1470,8 +1471,6 @@ impl Config {
 
     #[cfg(compiler)]
     pub(crate) fn build_compiler(&mut self) -> Result<Box<dyn wasmtime_environ::CompilerBuilder>> {
-        use target_lexicon::Architecture;
-
         let mut compiler = match self.compiler_config.strategy {
             Strategy::Auto | Strategy::Cranelift => wasmtime_cranelift::builder(),
         };
@@ -1480,35 +1479,30 @@ impl Config {
             compiler.target(target.clone())?;
         }
 
-        // On x86-64 targets, we enable stack probing by default.
+        // If probestack is enabled for a target, Wasmtime will always use the
+        // inline strategy, which doesn't require us to define a `__probestack`
+        // function or similar.
+        self.compiler_config
+            .settings
+            .insert("probestack_strategy".into(), "inline".into());
+
+        let host = target_lexicon::Triple::host();
+        let target = self.compiler_config.target.as_ref().unwrap_or(&host);
+
+        // On supported targets, we enable stack probing by default.
         // This is required on Windows because of the way Windows
         // commits its stacks, but it's also a good idea on other
         // platforms to ensure guard pages are hit for large frame
         // sizes.
-        if self
-            .compiler_config
-            .target
-            .as_ref()
-            .map(|t| t.architecture == Architecture::X86_64)
-            .unwrap_or(cfg!(target_arch = "x86_64"))
-        {
+        if probestack_supported(target.architecture) {
             self.compiler_config
                 .flags
                 .insert("enable_probestack".into());
-            self.compiler_config
-                .settings
-                .insert("probestack_strategy".into(), "inline".into());
         }
 
         if self.native_unwind_info ||
-            // Windows always needs unwind info, since it is part of the ABI.
-            self
-                .compiler_config
-                .target
-                .as_ref()
-                .map_or(cfg!(target_os = "windows"), |target| {
-                    target.operating_system == target_lexicon::OperatingSystem::Windows
-                })
+            // Windows always needs unwind info, since it is part of the ABI.
+            target.operating_system == target_lexicon::OperatingSystem::Windows
         {
             if !self
                 .compiler_config
@@ -1905,3 +1899,10 @@ impl PoolingAllocationConfig {
         self
     }
 }
+
+pub(crate) fn probestack_supported(arch: Architecture) -> bool {
+    matches!(
+        arch,
+        Architecture::X86_64 | Architecture::Aarch64(_) | Architecture::Riscv64(_)
+    )
+}
diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs
index 24961693bc6f..dfb386bf5708 100644
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -359,8 +359,6 @@ impl Engine {
         flag: &str,
         value: &FlagValue,
     ) -> Result<(), String> {
-        use target_lexicon::Architecture;
-
         let target = self.target();
         let ok = match flag {
             // These settings must all be enabled, since their value
@@ -369,11 +367,8 @@ impl Engine {
             "avoid_div_traps" => *value == FlagValue::Bool(true),
             "libcall_call_conv" => *value == FlagValue::Enum("isa_default".into()),
             "preserve_frame_pointers" => *value == FlagValue::Bool(true),
-
-            // On x86-64 targets, stack probing is enabled by default.
- "enable_probestack" => *value == FlagValue::Bool(target.architecture == Architecture::X86_64), - // On x86-64 targets, the default stack probing strategy is inline. - "probestack_strategy" => *value == FlagValue::Enum(if target.architecture == Architecture::X86_64 { "inline" } else { "outline" }.into()), + "enable_probestack" => *value == FlagValue::Bool(crate::config::probestack_supported(target.architecture)), + "probestack_strategy" => *value == FlagValue::Enum("inline".into()), // Features wasmtime doesn't use should all be disabled, since // otherwise if they are enabled it could change the behavior of diff --git a/tests/host_segfault.rs b/tests/host_segfault.rs index 6adac707e5d6..404fb4947028 100644 --- a/tests/host_segfault.rs +++ b/tests/host_segfault.rs @@ -144,6 +144,16 @@ fn main() { }, true, ), + ( + "overrun 8k with misconfigured host", + || overrun_with_big_module(8 << 10), + true, + ), + ( + "overrun 32k with misconfigured host", + || overrun_with_big_module(32 << 10), + true, + ), #[cfg(not(any(target_arch = "riscv64")))] // Due to `InstanceAllocationStrategy::pooling()` trying to alloc more than 6000G memory space. // https://gitlab.com/qemu-project/qemu/-/issues/1214 @@ -178,6 +188,7 @@ fn main() { } Err(_) => { for (name, _test, stack_overflow) in tests { + println!("running {name}"); run_test(name, *stack_overflow); } } @@ -245,7 +256,7 @@ fn is_stack_overflow(status: &ExitStatus, stderr: &str) -> bool { use std::os::unix::prelude::*; // The main thread might overflow or it might be from a fiber stack (SIGSEGV/SIGBUS) - stderr.contains("thread 'main' has overflowed its stack") + stderr.contains("has overflowed its stack") || match status.signal() { Some(libc::SIGSEGV) | Some(libc::SIGBUS) => true, _ => false, @@ -267,3 +278,47 @@ fn is_stack_overflow(status: &ExitStatus, _stderr: &str) -> bool { _ => false, } } + +fn overrun_with_big_module(approx_stack: usize) { + // Each call to `$get` produces ten 8-byte values which need to be saved + // onto the stack, so divide `approx_stack` by 80 to get + // a rough number of calls to consume `approx_stack` stack. + let n = approx_stack / 10 / 8; + + let mut s = String::new(); + s.push_str("(module\n"); + s.push_str("(func $big_stack\n"); + for _ in 0..n { + s.push_str("call $get\n"); + } + for _ in 0..n { + s.push_str("call $take\n"); + } + s.push_str(")\n"); + s.push_str("(func $get (result i64 i64 i64 i64 i64 i64 i64 i64 i64 i64) call $big_stack unreachable)\n"); + s.push_str("(func $take (param i64 i64 i64 i64 i64 i64 i64 i64 i64 i64) unreachable)\n"); + s.push_str("(func (export \"\") call $big_stack)\n"); + s.push_str(")\n"); + + // Give 100MB of stack to wasm, representing a misconfigured host. Run the + // actual module on a 2MB stack in a child thread to guarantee that the + // module here will overrun the stack. This should deterministically hit the + // guard page. + let mut config = Config::default(); + config.max_wasm_stack(100 << 20).async_stack_size(100 << 20); + let engine = Engine::new(&config).unwrap(); + let module = Module::new(&engine, &s).unwrap(); + let mut store = Store::new(&engine, ()); + let i = Instance::new(&mut store, &module, &[]).unwrap(); + let f = i.get_typed_func::<(), ()>(&mut store, "").unwrap(); + std::thread::Builder::new() + .stack_size(2 << 20) + .spawn(move || { + println!("{CONFIRM}"); + f.call(&mut store, ()).unwrap(); + }) + .unwrap() + .join() + .unwrap(); + unreachable!(); +}