From 6c2a91825db38940d8db66d5fee9743461cfcd80 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald
Date: Mon, 8 Apr 2024 09:35:41 -0700
Subject: [PATCH 1/2] Cranelift: Do not dedupe/GVN bitcasts from reference values

Deduping bitcasts to integers from references can make the references no
longer live across safepoints, and instead only the bitcasted integer
results would be. Because the reference is no longer live after the
safepoint, the safepoint's stack map would not have an entry for the
reference, which could result in the collector reclaiming an object too
early, which is basically a use-after-free bug. Luckily, we sandbox the GC
heap now, so such UAF bugs aren't memory unsafe, but they could potentially
result in denial of service attacks. Either way, we don't want those bugs!

On the other hand, it is technically fine to dedupe bitcasts *to* reference
types. Doing so extends, rather than shortens, the live range of the GC
reference. This potentially adds it to more stack maps than it otherwise
would have been in, which means it might unnecessarily survive a GC it
otherwise wouldn't have. But that is fine. Shrinking live ranges of GC
references, and removing them from stack maps they otherwise should have
been in, is the problematic transformation.

---
 cranelift/codegen/src/inst_predicates.rs  | 21 +++++-
 .../filetests/egraph/bitcast-ref.clif     | 71 +++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 cranelift/filetests/filetests/egraph/bitcast-ref.clif

diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs
index 707a01c1a5eb..c0e7ed66546f 100644
--- a/cranelift/codegen/src/inst_predicates.rs
+++ b/cranelift/codegen/src/inst_predicates.rs
@@ -44,6 +44,17 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
     trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data)
 }
 
+/// Is the given instruction a bitcast from a reference type (e.g. `r64`)?
+pub fn is_bitcast_from_ref(func: &Function, inst: Inst) -> bool {
+    let op = func.dfg.insts[inst].opcode();
+    if op != ir::Opcode::Bitcast {
+        return false;
+    }
+
+    let arg = func.dfg.inst_args(inst)[0];
+    func.dfg.value_type(arg).is_ref()
+}
+
 /// Does the given instruction behave as a "pure" node with respect to
 /// aegraph semantics?
 ///
@@ -58,6 +69,7 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {
         } => flags.readonly() && flags.notrap(),
         _ => false,
     };
+
     // Multi-value results do not play nicely with much of the egraph
     // infrastructure. They are in practice used only for multi-return
     // calls and some other odd instructions (e.g. uadd_overflow) which,
@@ -70,7 +82,11 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {
 
     let op = func.dfg.insts[inst].opcode();
 
-    has_one_result && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
+    has_one_result
+        && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
+        // Cannot optimize ref-y bitcasts, as that can interact badly with
+        // safepoints and stack maps.
+        && !is_bitcast_from_ref(func, inst)
 }
 
 /// Can the given instruction be merged into another copy of itself?
@@ -90,6 +106,9 @@ pub fn is_mergeable_for_egraph(func: &Function, inst: Inst) -> bool {
         && !op.can_store()
        // Can only have idempotent side-effects.
         && (!has_side_effect(func, inst) || op.side_effects_idempotent())
+        // Cannot optimize ref-y bitcasts, as that can interact badly with
+        // safepoints and stack maps.
+ && !is_bitcast_from_ref(func, inst) } /// Does the given instruction have any side-effect as per [has_side_effect], or else is a load, diff --git a/cranelift/filetests/filetests/egraph/bitcast-ref.clif b/cranelift/filetests/filetests/egraph/bitcast-ref.clif new file mode 100644 index 000000000000..e6bb678186c6 --- /dev/null +++ b/cranelift/filetests/filetests/egraph/bitcast-ref.clif @@ -0,0 +1,71 @@ +test optimize precise-output +set opt_level=speed +target x86_64 + +function %bitcast_from_r64(i64 vmctx, r64) -> i32 fast { + sig0 = (i32) fast + fn0 = colocated %foo sig0 + +block0(v0: i64, v1: r64): + v2 = bitcast.i64 v1 + v3 = ireduce.i32 v2 + + call fn0(v3) + + ;; We MUST NOT dedupe the bitcast *from* the reference type, as that would + ;; make `v1` no longer live across the `call`, which would mean that it + ;; would not show up in the `call`'s safepoint's stack map, which could + ;; result in the collector reclaiming an object too early, which is basically + ;; a use-after-free bug. + v4 = bitcast.i64 v1 + v5 = ireduce.i32 v4 + + return v5 +} + +; function %bitcast_from_r64(i64 vmctx, r64) -> i32 fast { +; sig0 = (i32) fast +; fn0 = colocated %foo sig0 +; +; block0(v0: i64, v1: r64): +; v2 = bitcast.i64 v1 +; v3 = ireduce.i32 v2 +; call fn0(v3) +; v4 = bitcast.i64 v1 +; v5 = ireduce.i32 v4 +; return v5 +; } + +function %bitcast_to_r64(i64 vmctx, i32) -> r64 fast { + sig0 = (r64) fast + fn0 = colocated %foo sig0 + +block0(v0: i64, v1: i32): + v2 = uextend.i64 v1 + v3 = bitcast.r64 v2 + + call fn0(v3) + + ;; On the other hand, it is technically fine to dedupe bitcasts *to* + ;; reference types. Doing so extends the live range of the GC reference -- + ;; potentially adding it to more stack maps than it otherwise would have + ;; been in, which means it might unnecessarily survive a GC it otherwise + ;; wouldn't have -- but that is okay. Shrinking live ranges of GC + ;; references, and removing them from stack maps they otherwise would have + ;; been in, is the problematic transformation. 
+ v4 = uextend.i64 v1 + v5 = bitcast.r64 v4 + + return v5 +} + +; function %bitcast_to_r64(i64 vmctx, i32) -> r64 fast { +; sig0 = (r64) fast +; fn0 = colocated %foo sig0 +; +; block0(v0: i64, v1: i32): +; v2 = uextend.i64 v1 +; v3 = bitcast.r64 v2 +; call fn0(v3) +; return v3 +; } From 9d38d7939e485d9c6dc18a4ab865dfbc805f4489 Mon Sep 17 00:00:00 2001 From: Nick Fitzgerald Date: Mon, 8 Apr 2024 12:55:55 -0700 Subject: [PATCH 2/2] Add additional logging and debug asserts for GC stuff --- cranelift/codegen/src/bitset.rs | 24 +++- cranelift/codegen/src/machinst/buffer.rs | 2 +- crates/runtime/src/gc/enabled/drc.rs | 107 ++++++++++++++++-- crates/runtime/src/gc/gc_runtime.rs | 17 +-- crates/runtime/src/gc/host_data.rs | 5 +- crates/runtime/src/libcalls.rs | 1 + crates/runtime/src/table.rs | 8 +- .../wasmtime/src/runtime/externals/global.rs | 4 +- .../wasmtime/src/runtime/externals/table.rs | 4 +- .../src/runtime/gc/enabled/rooting.rs | 13 ++- crates/wasmtime/src/runtime/store.rs | 7 +- 11 files changed, 164 insertions(+), 28 deletions(-) diff --git a/cranelift/codegen/src/bitset.rs b/cranelift/codegen/src/bitset.rs index 7d81b3221cdf..70c86d722210 100644 --- a/cranelift/codegen/src/bitset.rs +++ b/cranelift/codegen/src/bitset.rs @@ -10,13 +10,35 @@ use core::mem::size_of; use core::ops::{Add, BitOr, Shl, Sub}; /// A small bitset built on a single primitive integer type -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +#[derive(Clone, Copy, Default, PartialEq, Eq)] #[cfg_attr( feature = "enable-serde", derive(serde_derive::Serialize, serde_derive::Deserialize) )] pub struct BitSet(pub T); +impl std::fmt::Debug for BitSet +where + T: Into + + From + + BitOr + + Shl + + Sub + + Add + + PartialEq + + Copy, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut s = f.debug_struct(std::any::type_name::()); + for i in 0..Self::bits() { + use std::string::ToString; + let i = u32::try_from(i).unwrap(); + s.field(&i.to_string(), &self.contains(i)); + } + s.finish() + } +} + impl BitSet where T: Into diff --git a/cranelift/codegen/src/machinst/buffer.rs b/cranelift/codegen/src/machinst/buffer.rs index 8abe64a687e1..a18adea65036 100644 --- a/cranelift/codegen/src/machinst/buffer.rs +++ b/cranelift/codegen/src/machinst/buffer.rs @@ -1675,7 +1675,7 @@ impl MachBuffer { (start_offset, end_offset) } }; - trace!("Adding stack map for offsets {start:#x}..{end:#x}"); + trace!("Adding stack map for offsets {start:#x}..{end:#x}: {stack_map:?}"); self.stack_maps.push(MachStackMap { offset: start, offset_end: end, diff --git a/crates/runtime/src/gc/enabled/drc.rs b/crates/runtime/src/gc/enabled/drc.rs index 6987984da087..56bba9838687 100644 --- a/crates/runtime/src/gc/enabled/drc.rs +++ b/crates/runtime/src/gc/enabled/drc.rs @@ -160,6 +160,12 @@ impl DrcHeap { let drc_ref = drc_ref(gc_ref); let header = self.index_mut(&drc_ref); + debug_assert_ne!( + *header.ref_count.get_mut(), + 0, + "{:#p} is supposedly live; should have nonzero ref count", + *gc_ref + ); *header.ref_count.get_mut() += 1; log::trace!( "increment {:#p} ref count -> {}", @@ -179,6 +185,12 @@ impl DrcHeap { let drc_ref = drc_ref(gc_ref); let header = self.index_mut(drc_ref); + debug_assert_ne!( + *header.ref_count.get_mut(), + 0, + "{:#p} is supposedly live; should have nonzero ref count", + *gc_ref + ); *header.ref_count.get_mut() -= 1; log::trace!( "decrement {:#p} ref count -> {}", @@ -261,6 +273,13 @@ impl DrcHeap { continue; } + debug_assert_ne!( + *self.index_mut(drc_ref(&gc_ref)).ref_count.get_mut(), + 0, 
+ "{gc_ref:#p} is on the Wasm stack and therefore should be held \ + by the activations table; should have nonzero ref count", + ); + log::trace!("Found GC reference on the stack: {:#p}", gc_ref); let is_new = self .activations_table @@ -272,26 +291,76 @@ impl DrcHeap { } } - /// Sweep the bump allocation table after we've discovered our precise stack - /// roots. - fn sweep(&mut self, host_data_table: &mut ExternRefHostDataTable) { - // Sweep our bump chunk. + fn iter_bump_chunk(&mut self) -> impl Iterator + '_ { let num_filled = self.activations_table.num_filled_in_bump_chunk(); + self.activations_table + .alloc + .chunk + .iter_mut() + .take(num_filled) + .map(|slot| { + let r64 = *slot.get_mut(); + VMGcRef::from_r64(r64) + .expect("valid r64") + .expect("non-null") + }) + } + + #[inline(never)] + #[cold] + fn log_gc_ref_set(prefix: &str, items: impl Iterator) { + assert!(log::log_enabled!(log::Level::Trace)); + let mut set = "{".to_string(); + let mut any = false; + for gc_ref in items { + any = true; + set += &format!("\n {gc_ref:#p},"); + } + if any { + set.push('\n'); + } + set.push('}'); + log::trace!("{prefix}: {set}"); + } + + fn drain_bump_chunk(&mut self, mut f: impl FnMut(&mut Self, VMGcRef)) { + let num_filled = self.activations_table.num_filled_in_bump_chunk(); + + // Temporarily take the allocation out of `self` to avoid conflicting + // borrows. let mut alloc = mem::take(&mut self.activations_table.alloc); for slot in alloc.chunk.iter_mut().take(num_filled) { let r64 = mem::take(slot.get_mut()); let gc_ref = VMGcRef::from_r64(r64) .expect("valid r64") .expect("non-null"); - self.dec_ref_and_maybe_dealloc(host_data_table, &gc_ref); - + f(self, gc_ref); *slot.get_mut() = 0; } + debug_assert!( - alloc.chunk.iter().all(|slot| unsafe { *slot.get() == 0 }), - "after sweeping the bump chunk, all slots should be empty" + alloc.chunk.iter_mut().all(|slot| *slot.get_mut() == 0), + "after sweeping the bump chunk, all slots should be empty", ); + debug_assert!(self.activations_table.alloc.chunk.is_empty()); + self.activations_table.alloc = alloc; + } + + /// Sweep the bump allocation table after we've discovered our precise stack + /// roots. + fn sweep(&mut self, host_data_table: &mut ExternRefHostDataTable) { + if log::log_enabled!(log::Level::Trace) { + Self::log_gc_ref_set("bump chunk before sweeping", self.iter_bump_chunk()); + } + + // Sweep our bump chunk. + log::trace!("Begin sweeping bump chunk"); + self.drain_bump_chunk(|heap, gc_ref| { + heap.dec_ref_and_maybe_dealloc(host_data_table, &gc_ref); + }); + log::trace!("Done sweeping bump chunk"); + if self.activations_table.alloc.chunk.is_empty() { // If this is the first collection, then the bump chunk is empty // since we lazily allocate it. Force that lazy allocation now so we @@ -302,6 +371,16 @@ impl DrcHeap { self.activations_table.alloc.reset(); } + if log::log_enabled!(log::Level::Trace) { + Self::log_gc_ref_set( + "hash set before sweeping", + self.activations_table + .over_approximated_stack_roots + .iter() + .map(|r| r.unchecked_copy()), + ); + } + // The current `precise_stack_roots` becomes our new over-appoximated // set for the next GC cycle. mem::swap( @@ -315,14 +394,26 @@ impl DrcHeap { // Note that this may run arbitrary code as we run gc_ref // destructors. Because of our `&mut` borrow above on this table, // though, we're guaranteed that nothing will touch this table. 
+ log::trace!("Begin sweeping hash set"); let mut precise_stack_roots = mem::take(&mut self.activations_table.precise_stack_roots); for gc_ref in precise_stack_roots.drain() { self.dec_ref_and_maybe_dealloc(host_data_table, &gc_ref); } + log::trace!("Done sweeping hash set"); // Make sure to replace the `precise_stack_roots` so that we reuse any // allocated capacity. self.activations_table.precise_stack_roots = precise_stack_roots; + + if log::log_enabled!(log::Level::Trace) { + Self::log_gc_ref_set( + "hash set after sweeping", + self.activations_table + .over_approximated_stack_roots + .iter() + .map(|r| r.unchecked_copy()), + ); + } } } diff --git a/crates/runtime/src/gc/gc_runtime.rs b/crates/runtime/src/gc/gc_runtime.rs index e3cbff94f070..1d2741dc5a8f 100644 --- a/crates/runtime/src/gc/gc_runtime.rs +++ b/crates/runtime/src/gc/gc_runtime.rs @@ -311,19 +311,22 @@ enum RawGcRoot { impl GcRootsList { /// Add a GC root that is inside a Wasm stack frame to this list. - // - // TODO: we shouldn't need to differentiate between on-stack GC roots and - // others, and both should just be `*mut VMGcRef`, but because Cranelift - // doesn't support `r32` on 64-bit platforms yet, we have to use `r64` for - // roots on the Wasm stack instead. #[inline] - pub fn add_wasm_stack_root(&mut self, ptr_to_root: SendSyncPtr) { + pub unsafe fn add_wasm_stack_root(&mut self, ptr_to_root: SendSyncPtr) { + log::trace!( + "Adding Wasm stack root: {:#p}", + VMGcRef::from_r64(*ptr_to_root.as_ref()).unwrap().unwrap() + ); self.0.push(RawGcRoot::Stack(ptr_to_root)); } /// Add a GC root to this list. #[inline] - pub fn add_root(&mut self, ptr_to_root: SendSyncPtr) { + pub unsafe fn add_root(&mut self, ptr_to_root: SendSyncPtr) { + log::trace!( + "Adding non-stack root: {:#p}", + ptr_to_root.as_ref().unchecked_copy() + ); self.0.push(RawGcRoot::NonStack(ptr_to_root)) } diff --git a/crates/runtime/src/gc/host_data.rs b/crates/runtime/src/gc/host_data.rs index 933935bedc5e..13161b508fb9 100644 --- a/crates/runtime/src/gc/host_data.rs +++ b/crates/runtime/src/gc/host_data.rs @@ -39,11 +39,14 @@ impl ExternRefHostDataTable { /// Allocate a new `externref` host data value. pub fn alloc(&mut self, value: Box) -> ExternRefHostDataId { let id = self.slab.alloc(value); - ExternRefHostDataId(id) + let id = ExternRefHostDataId(id); + log::trace!("allocated new externref host data: {id:?}"); + id } /// Deallocate an `externref` host data value. pub fn dealloc(&mut self, id: ExternRefHostDataId) -> Box { + log::trace!("deallocated externref host data: {id:?}"); self.slab.dealloc(id.0) } diff --git a/crates/runtime/src/libcalls.rs b/crates/runtime/src/libcalls.rs index 1095948bae00..c2074eeeefd2 100644 --- a/crates/runtime/src/libcalls.rs +++ b/crates/runtime/src/libcalls.rs @@ -380,6 +380,7 @@ unsafe fn drop_gc_ref(instance: &mut Instance, gc_ref: *mut u8) { let gc_ref = VMGcRef::from_r64(u64::try_from(gc_ref as usize).unwrap()) .expect("valid r64") .expect("non-null VMGcRef"); + log::trace!("libcalls::drop_gc_ref({gc_ref:?})"); (*instance.store()).gc_store().drop_gc_ref(gc_ref); } diff --git a/crates/runtime/src/table.rs b/crates/runtime/src/table.rs index 4a0ea6019eba..f138d1a37124 100644 --- a/crates/runtime/src/table.rs +++ b/crates/runtime/src/table.rs @@ -467,16 +467,16 @@ impl Table { TableElement::FuncRef(f) => { self.funcrefs_mut()[start..end].fill(TaggedFuncRef::from(f)); } - TableElement::GcRef(e) => { + TableElement::GcRef(r) => { // Clone the init GC reference into each table slot. 
for slot in &mut self.gc_refs_mut()[start..end] { - gc_store.write_gc_ref(slot, e.as_ref()); + gc_store.write_gc_ref(slot, r.as_ref()); } // Drop the init GC reference, since we aren't holding onto this // reference anymore, only the clones in the table. - if let Some(e) = e { - gc_store.drop_gc_ref(e); + if let Some(r) = r { + gc_store.drop_gc_ref(r); } } TableElement::UninitFunc => { diff --git a/crates/wasmtime/src/runtime/externals/global.rs b/crates/wasmtime/src/runtime/externals/global.rs index 8276e6b98c22..dc58268d29d8 100644 --- a/crates/wasmtime/src/runtime/externals/global.rs +++ b/crates/wasmtime/src/runtime/externals/global.rs @@ -213,7 +213,9 @@ impl Global { if let Some(gc_ref) = unsafe { (*store[self.0].definition).as_gc_ref() } { let gc_ref = NonNull::from(gc_ref); let gc_ref = SendSyncPtr::new(gc_ref); - gc_roots_list.add_root(gc_ref); + unsafe { + gc_roots_list.add_root(gc_ref); + } } } } diff --git a/crates/wasmtime/src/runtime/externals/table.rs b/crates/wasmtime/src/runtime/externals/table.rs index 145723a36aaa..0adc6ad9450e 100644 --- a/crates/wasmtime/src/runtime/externals/table.rs +++ b/crates/wasmtime/src/runtime/externals/table.rs @@ -387,7 +387,9 @@ impl Table { if let Some(gc_ref) = gc_ref { let gc_ref = NonNull::from(gc_ref); let gc_ref = SendSyncPtr::new(gc_ref); - gc_roots_list.add_root(gc_ref); + unsafe { + gc_roots_list.add_root(gc_ref); + } } } } diff --git a/crates/wasmtime/src/runtime/gc/enabled/rooting.rs b/crates/wasmtime/src/runtime/gc/enabled/rooting.rs index 06193a529192..293765e0a49d 100644 --- a/crates/wasmtime/src/runtime/gc/enabled/rooting.rs +++ b/crates/wasmtime/src/runtime/gc/enabled/rooting.rs @@ -370,12 +370,21 @@ struct LifoRoot { impl RootSet { pub(crate) fn trace_roots(&mut self, gc_roots_list: &mut GcRootsList) { + log::trace!("Begin trace user LIFO roots"); for root in &mut self.lifo_roots { - gc_roots_list.add_root((&mut root.gc_ref).into()); + unsafe { + gc_roots_list.add_root((&mut root.gc_ref).into()); + } } + log::trace!("End trace user LIFO roots"); + + log::trace!("Begin trace user manual roots"); for (_id, root) in self.manually_rooted.iter_mut() { - gc_roots_list.add_root(root.into()); + unsafe { + gc_roots_list.add_root(root.into()); + } } + log::trace!("End trace user manual roots"); } /// Enter a LIFO rooting scope. diff --git a/crates/wasmtime/src/runtime/store.rs b/crates/wasmtime/src/runtime/store.rs index f54c5846f88a..c21273fab4fb 100644 --- a/crates/wasmtime/src/runtime/store.rs +++ b/crates/wasmtime/src/runtime/store.rs @@ -1691,8 +1691,11 @@ impl StoreOpaque { .expect("we should never use the high 32 bits of an r64"); if gc_ref.is_some() { - gc_roots_list - .add_wasm_stack_root(SendSyncPtr::new(NonNull::new(stack_slot).unwrap())); + unsafe { + gc_roots_list.add_wasm_stack_root(SendSyncPtr::new( + NonNull::new(stack_slot).unwrap(), + )); + } } }