From b690a15f3f6b098370450f3c4e278963ba449180 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 19 Mar 2021 11:29:00 -0700 Subject: [PATCH] Optimize calling a WebAssembly function This commit implements a few optimizations, mainly inlining, that should improve the performance of calling a WebAssembly function. This code path can be quite hot depending on the embedding case and we hadn't really put much effort into optimizing the nitty-gritty. The predominant optimization here is adding `#[inline]` to trivial functions so performance is improved without having to compile with LTO. Another optimization is to call `lazy_per_thread_init` when traps are initialized per-thread (when a `Store` is created) rather than each time a function is called. The next optimization is to change the unwind reason in the `CallThreadState` to `MaybeUninit` to avoid extra checks in the default case about whether we need to drop its variants (since in the happy path we never need to drop it). The final optimization is to optimize out a few checks when `async` support is disabled for a small speed boost. In a small benchmark where wasmtime calls a simple wasm function, the overhead on my macOS computer dropped from 110ns to 86ns, roughly a 20% decrease. The macOS overhead is still largely dominated by the global lock acquisition and hash table management for traps right now, but I suspect the Linux overhead is much better (should be on the order of ~30 or so ns). We still have a long way to go to compete with SpiderMonkey, which, in testing, seems to have ~6ns overhead in calling the same wasm function on my computer. 
--- crates/runtime/src/externref.rs | 2 ++ crates/runtime/src/traphandlers.rs | 52 +++++++++++++++--------------- crates/wasmtime/src/engine.rs | 1 + crates/wasmtime/src/func.rs | 4 ++- crates/wasmtime/src/func/typed.rs | 2 +- crates/wasmtime/src/store.rs | 9 +++++- 6 files changed, 41 insertions(+), 29 deletions(-) diff --git a/crates/runtime/src/externref.rs b/crates/runtime/src/externref.rs index 7fd964b8e3d7..01c2965777e4 100644 --- a/crates/runtime/src/externref.rs +++ b/crates/runtime/src/externref.rs @@ -752,6 +752,7 @@ impl VMExternRefActivationsTable { /// // call has returned. /// drop(auto_reset_canary); /// ``` + #[inline] pub fn set_stack_canary<'a>(&'a self, canary: &u8) -> impl Drop + 'a { let should_reset = if self.stack_canary.get().is_none() { let canary = canary as *const u8 as *mut u8; @@ -775,6 +776,7 @@ impl VMExternRefActivationsTable { } impl Drop for AutoResetCanary<'_> { + #[inline] fn drop(&mut self) { if self.should_reset { debug_assert!(self.table.stack_canary.get().is_some()); diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index d9fed4c17e3d..00acc79a86b9 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -4,8 +4,9 @@ use crate::VMInterrupts; use backtrace::Backtrace; use std::any::Any; -use std::cell::Cell; +use std::cell::{Cell, UnsafeCell}; use std::error::Error; +use std::mem::MaybeUninit; use std::ptr; use std::sync::atomic::{AtomicUsize, Ordering::SeqCst}; use std::sync::Once; @@ -47,9 +48,10 @@ pub use sys::SignalHandler; /// function needs to be called at the end of the startup process, after other /// handlers have been installed. This function can thus be called multiple /// times, having no effect after the first call. -pub fn init_traps() { +pub fn init_traps() -> Result<(), Trap> { static INIT: Once = Once::new(); INIT.call_once(|| unsafe { sys::platform_init() }); + sys::lazy_per_thread_init() } /// Raises a user-defined trap immediately. 
@@ -155,8 +157,6 @@ pub unsafe fn catch_traps(trap_info: &impl TrapInfo, mut closure: F) -> Resul where F: FnMut(), { - sys::lazy_per_thread_init()?; - return CallThreadState::new(trap_info).with(|cx| { RegisterSetjmp( cx.jmp_buf.as_ptr(), @@ -191,7 +191,7 @@ pub fn out_of_gas() { /// Temporary state stored on the stack which is registered in the `tls` module /// below for calls into wasm. pub struct CallThreadState<'a> { - unwind: Cell, + unwind: UnsafeCell>, jmp_buf: Cell<*const u8>, handling_trap: Cell, trap_info: &'a (dyn TrapInfo + 'a), @@ -232,7 +232,6 @@ pub unsafe trait TrapInfo { } enum UnwindReason { - None, Panic(Box), UserTrap(Box), LibTrap(Trap), @@ -240,9 +239,10 @@ enum UnwindReason { } impl<'a> CallThreadState<'a> { + #[inline] fn new(trap_info: &'a (dyn TrapInfo + 'a)) -> CallThreadState<'a> { CallThreadState { - unwind: Cell::new(UnwindReason::None), + unwind: UnsafeCell::new(MaybeUninit::uninit()), jmp_buf: Cell::new(ptr::null()), handling_trap: Cell::new(false), trap_info, @@ -253,18 +253,13 @@ impl<'a> CallThreadState<'a> { fn with(self, closure: impl FnOnce(&CallThreadState) -> i32) -> Result<(), Trap> { let _reset = self.update_stack_limit()?; let ret = tls::set(&self, || closure(&self)); - match self.unwind.replace(UnwindReason::None) { - UnwindReason::None => { - debug_assert_eq!(ret, 1); - Ok(()) - } - UnwindReason::UserTrap(data) => { - debug_assert_eq!(ret, 0); - Err(Trap::User(data)) - } + if ret != 0 { + return Ok(()); + } + match unsafe { (*self.unwind.get()).as_ptr().read() } { + UnwindReason::UserTrap(data) => Err(Trap::User(data)), UnwindReason::LibTrap(trap) => Err(trap), UnwindReason::JitTrap { backtrace, pc } => { - debug_assert_eq!(ret, 0); let interrupts = self.trap_info.interrupts(); let maybe_interrupted = interrupts.stack_limit.load(SeqCst) == wasmtime_environ::INTERRUPTED; @@ -274,10 +269,7 @@ impl<'a> CallThreadState<'a> { maybe_interrupted, }) } - UnwindReason::Panic(panic) => { - debug_assert_eq!(ret, 0); - 
std::panic::resume_unwind(panic) - } + UnwindReason::Panic(panic) => std::panic::resume_unwind(panic), } } @@ -310,6 +302,7 @@ impl<'a> CallThreadState<'a> { /// /// Note that this function must be called with `self` on the stack, not the /// heap/etc. + #[inline] fn update_stack_limit(&self) -> Result { // Determine the stack pointer where, after which, any wasm code will // immediately trap. This is checked on the entry to all wasm functions. @@ -361,6 +354,7 @@ impl<'a> CallThreadState<'a> { struct Reset<'a>(bool, &'a AtomicUsize); impl Drop for Reset<'_> { + #[inline] fn drop(&mut self) { if self.0 { self.1.store(usize::max_value(), SeqCst); @@ -372,8 +366,8 @@ impl<'a> CallThreadState<'a> { } fn unwind_with(&self, reason: UnwindReason) -> ! { - self.unwind.replace(reason); unsafe { + (*self.unwind.get()).as_mut_ptr().write(reason); Unwind(self.jmp_buf.get()); } } @@ -432,16 +426,21 @@ impl<'a> CallThreadState<'a> { fn capture_backtrace(&self, pc: *const u8) { let backtrace = Backtrace::new_unresolved(); - self.unwind.replace(UnwindReason::JitTrap { - backtrace, - pc: pc as usize, - }); + unsafe { + (*self.unwind.get()) + .as_mut_ptr() + .write(UnwindReason::JitTrap { + backtrace, + pc: pc as usize, + }); + } } } struct ResetCell<'a, T: Copy>(&'a Cell, T); impl Drop for ResetCell<'_, T> { + #[inline] fn drop(&mut self) { self.0.set(self.1); } @@ -544,6 +543,7 @@ mod tls { struct Reset<'a, 'b>(&'a CallThreadState<'b>); impl Drop for Reset<'_, '_> { + #[inline] fn drop(&mut self) { raw::replace(self.0.prev.replace(ptr::null())); } diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs index 245c02a300d2..eab429d509a1 100644 --- a/crates/wasmtime/src/engine.rs +++ b/crates/wasmtime/src/engine.rs @@ -56,6 +56,7 @@ impl Engine { } /// Returns the configuration settings that this engine is using. 
+ #[inline] pub fn config(&self) -> &Config { &self.inner.config } diff --git a/crates/wasmtime/src/func.rs b/crates/wasmtime/src/func.rs index af33ed2fe313..66eef8660241 100644 --- a/crates/wasmtime/src/func.rs +++ b/crates/wasmtime/src/func.rs @@ -804,7 +804,7 @@ impl Func { /// initiates a panic. pub fn call(&self, params: &[Val]) -> Result> { assert!( - !self.store().async_support(), + !cfg!(feature = "async") || !self.store().async_support(), "must use `call_async` when async support is enabled on the config", ); self._call(params) @@ -926,6 +926,7 @@ impl Func { } /// Get a reference to this function's store. + #[inline] pub fn store(&self) -> &Store { &self.instance.store } @@ -1414,6 +1415,7 @@ impl Caller<'_> { } /// Get a reference to the caller's store. + #[inline] pub fn store(&self) -> &Store { self.store } diff --git a/crates/wasmtime/src/func/typed.rs b/crates/wasmtime/src/func/typed.rs index 1558c919824f..d9059a2fb308 100644 --- a/crates/wasmtime/src/func/typed.rs +++ b/crates/wasmtime/src/func/typed.rs @@ -53,7 +53,7 @@ where /// connected to an asynchronous store. pub fn call(&self, params: Params) -> Result { assert!( - !self.func.store().async_support(), + !cfg!(feature = "async") || !self.func.store().async_support(), "must use `call_async` with async stores" ); unsafe { self._call(params) } diff --git a/crates/wasmtime/src/store.rs b/crates/wasmtime/src/store.rs index 6701506b9a39..df177fb06113 100644 --- a/crates/wasmtime/src/store.rs +++ b/crates/wasmtime/src/store.rs @@ -136,7 +136,7 @@ impl Store { // once-per-thread. Platforms like Unix, however, only require this // once-per-program. In any case this is safe to call many times and // each one that's not relevant just won't do anything. - wasmtime_runtime::init_traps(); + wasmtime_runtime::init_traps().expect("failed to initialize trap handling"); Store { inner: Rc::new(StoreInner { @@ -209,6 +209,7 @@ impl Store { } /// Returns the [`Engine`] that this store is associated with. 
+ #[inline] pub fn engine(&self) -> &Engine { &self.inner.engine } @@ -503,10 +504,12 @@ impl Store { } } + #[inline] pub(crate) fn externref_activations_table(&self) -> &VMExternRefActivationsTable { &self.inner.externref_activations_table } + #[inline] pub(crate) fn stack_map_registry(&self) -> &StackMapRegistry { &self.inner.stack_map_registry } @@ -655,6 +658,7 @@ impl Store { }); } + #[inline] pub(crate) fn async_support(&self) -> bool { self.inner.engine.config().async_support } @@ -915,6 +919,7 @@ impl Store { } unsafe impl TrapInfo for Store { + #[inline] fn as_any(&self) -> &dyn Any { self } @@ -930,6 +935,7 @@ unsafe impl TrapInfo for Store { false } + #[inline] fn max_wasm_stack(&self) -> usize { self.engine().config().max_wasm_stack } @@ -956,6 +962,7 @@ unsafe impl TrapInfo for Store { } } + #[inline] fn interrupts(&self) -> &VMInterrupts { &self.inner.interrupts }