Cranelift: Do not dedupe/GVN bitcasts from reference values #8317

Merged
24 changes: 23 additions & 1 deletion cranelift/codegen/src/bitset.rs
@@ -10,13 +10,35 @@ use core::mem::size_of;
use core::ops::{Add, BitOr, Shl, Sub};

/// A small bitset built on a single primitive integer type
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[derive(Clone, Copy, Default, PartialEq, Eq)]
#[cfg_attr(
feature = "enable-serde",
derive(serde_derive::Serialize, serde_derive::Deserialize)
)]
pub struct BitSet<T>(pub T);

impl<T> std::fmt::Debug for BitSet<T>
where
T: Into<u32>
+ From<u8>
+ BitOr<T, Output = T>
+ Shl<u8, Output = T>
+ Sub<T, Output = T>
+ Add<T, Output = T>
+ PartialEq
+ Copy,
{
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let mut s = f.debug_struct(std::any::type_name::<Self>());
for i in 0..Self::bits() {
use std::string::ToString;
let i = u32::try_from(i).unwrap();
s.field(&i.to_string(), &self.contains(i));
}
s.finish()
}
}

impl<T> BitSet<T>
where
T: Into<u32>
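For context, a minimal sketch of the formatting strategy the new `Debug` impl uses: every bit index becomes a named field whose value reports whether that bit is set. The `TinySet` type below is an invented stand-in, not Cranelift's actual `BitSet`.

use std::fmt;

struct TinySet(u8);

impl fmt::Debug for TinySet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Mirror the loop above: one field per bit index, value = "is the bit set?".
        let mut s = f.debug_struct("TinySet");
        for i in 0..8u32 {
            s.field(&i.to_string(), &((self.0 >> i) & 1 == 1));
        }
        s.finish()
    }
}

fn main() {
    // Prints: TinySet { 0: true, 1: false, 2: false, 3: true, 4: false, 5: false, 6: false, 7: false }
    println!("{:?}", TinySet(0b0000_1001));
}

The immediate payoff is the `{stack_map:?}` trace added in `buffer.rs` below, which presumably relies on this impl to show which slots a stack map marks as holding live GC references.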
21 changes: 20 additions & 1 deletion cranelift/codegen/src/inst_predicates.rs
@@ -44,6 +44,17 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data)
}

/// Is the given instruction a bitcast from a reference type (e.g. `r64`)?
pub fn is_bitcast_from_ref(func: &Function, inst: Inst) -> bool {
let op = func.dfg.insts[inst].opcode();
if op != ir::Opcode::Bitcast {
return false;
}

let arg = func.dfg.inst_args(inst)[0];
func.dfg.value_type(arg).is_ref()
}

/// Does the given instruction behave as a "pure" node with respect to
/// aegraph semantics?
///
@@ -58,6 +69,7 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {
} => flags.readonly() && flags.notrap(),
_ => false,
};

// Multi-value results do not play nicely with much of the egraph
// infrastructure. They are in practice used only for multi-return
// calls and some other odd instructions (e.g. uadd_overflow) which,
@@ -70,7 +82,11 @@ pub fn is_pure_for_egraph(func: &Function, inst: Inst) -> bool {

let op = func.dfg.insts[inst].opcode();

has_one_result && (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
has_one_result
&& (is_readonly_load || (!op.can_load() && !trivially_has_side_effects(op)))
// Cannot optimize ref-y bitcasts, as that can interact badly with
// safepoints and stack maps.
&& !is_bitcast_from_ref(func, inst)
}

/// Can the given instruction be merged into another copy of itself?
@@ -90,6 +106,9 @@ pub fn is_mergeable_for_egraph(func: &Function, inst: Inst) -> bool {
&& !op.can_store()
// Can only have idempotent side-effects.
&& (!has_side_effect(func, inst) || op.side_effects_idempotent())
// Cannot optimize ref-y bitcasts, as that can interact badly with
// safepoints and stack maps.
&& !is_bitcast_from_ref(func, inst)
}

/// Does the given instruction have any side-effect as per [has_side_effect], or else is a load,
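A hedged, self-contained sketch of the check the two egraph predicates now share; the types below are simplified stand-ins for Cranelift's actual `Opcode`, `Type`, and DFG APIs, shown only to spell out the logic.

enum Opcode { Bitcast, Iadd }

enum Type { I64, R64 }

impl Type {
    fn is_ref(&self) -> bool {
        matches!(self, Type::R64)
    }
}

struct ToyInst {
    opcode: Opcode,
    arg_ty: Type, // type of the instruction's first (and only) argument
}

// Stand-in for `is_bitcast_from_ref`: only a `bitcast` whose *input* is a
// reference type is excluded from GVN, because merging two such bitcasts can
// shrink the reference's live range across a safepoint.
fn is_bitcast_from_ref(inst: &ToyInst) -> bool {
    matches!(inst.opcode, Opcode::Bitcast) && inst.arg_ty.is_ref()
}

fn main() {
    assert!(is_bitcast_from_ref(&ToyInst { opcode: Opcode::Bitcast, arg_ty: Type::R64 }));
    // A bitcast *to* a reference (integer input) is still fair game for GVN.
    assert!(!is_bitcast_from_ref(&ToyInst { opcode: Opcode::Bitcast, arg_ty: Type::I64 }));
    assert!(!is_bitcast_from_ref(&ToyInst { opcode: Opcode::Iadd, arg_ty: Type::I64 }));
}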
2 changes: 1 addition & 1 deletion cranelift/codegen/src/machinst/buffer.rs
@@ -1675,7 +1675,7 @@ impl<I: VCodeInst> MachBuffer<I> {
(start_offset, end_offset)
}
};
trace!("Adding stack map for offsets {start:#x}..{end:#x}");
trace!("Adding stack map for offsets {start:#x}..{end:#x}: {stack_map:?}");
self.stack_maps.push(MachStackMap {
offset: start,
offset_end: end,
71 changes: 71 additions & 0 deletions cranelift/filetests/filetests/egraph/bitcast-ref.clif
@@ -0,0 +1,71 @@
test optimize precise-output
set opt_level=speed
target x86_64

function %bitcast_from_r64(i64 vmctx, r64) -> i32 fast {
sig0 = (i32) fast
fn0 = colocated %foo sig0

block0(v0: i64, v1: r64):
v2 = bitcast.i64 v1
v3 = ireduce.i32 v2

call fn0(v3)

;; We MUST NOT dedupe the bitcast *from* the reference type, as that would
;; make `v1` no longer live across the `call`, which would mean that it
;; would not show up in the `call`'s safepoint's stack map, which could
;; result in the collector reclaiming an object too early, which is basically
;; a use-after-free bug.
v4 = bitcast.i64 v1
v5 = ireduce.i32 v4

return v5
}

; function %bitcast_from_r64(i64 vmctx, r64) -> i32 fast {
; sig0 = (i32) fast
; fn0 = colocated %foo sig0
;
; block0(v0: i64, v1: r64):
; v2 = bitcast.i64 v1
; v3 = ireduce.i32 v2
; call fn0(v3)
; v4 = bitcast.i64 v1
; v5 = ireduce.i32 v4
; return v5
; }

function %bitcast_to_r64(i64 vmctx, i32) -> r64 fast {
sig0 = (r64) fast
fn0 = colocated %foo sig0

block0(v0: i64, v1: i32):
v2 = uextend.i64 v1
v3 = bitcast.r64 v2

call fn0(v3)

;; On the other hand, it is technically fine to dedupe bitcasts *to*
;; reference types. Doing so extends the live range of the GC reference --
;; potentially adding it to more stack maps than it otherwise would have
;; been in, which means it might unnecessarily survive a GC it otherwise
;; wouldn't have -- but that is okay. Shrinking live ranges of GC
;; references, and removing them from stack maps they otherwise would have
;; been in, is the problematic transformation.
v4 = uextend.i64 v1
v5 = bitcast.r64 v4

return v5
}

; function %bitcast_to_r64(i64 vmctx, i32) -> r64 fast {
; sig0 = (r64) fast
; fn0 = colocated %foo sig0
;
; block0(v0: i64, v1: i32):
; v2 = uextend.i64 v1
; v3 = bitcast.r64 v2
; call fn0(v3)
; return v3
; }
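
To make the hazard concrete, here is a hypothetical (and deliberately wrong) version of the first function's output, i.e. what GVN would produce if it were allowed to merge the two `bitcast.i64 v1` instructions; this is exactly the shape the new predicate guard rules out:

;; NOT actual output -- shown only to illustrate the bug being prevented.
;; With the second bitcast deduped into `v2`, `v1` is dead after the first
;; bitcast, so it would be absent from the stack map at `call fn0(v3)` and the
;; collector could reclaim the object it points to while it is still needed.
function %bitcast_from_r64(i64 vmctx, r64) -> i32 fast {
    sig0 = (i32) fast
    fn0 = colocated %foo sig0

block0(v0: i64, v1: r64):
    v2 = bitcast.i64 v1
    v3 = ireduce.i32 v2
    call fn0(v3)
    return v3
}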
107 changes: 99 additions & 8 deletions crates/runtime/src/gc/enabled/drc.rs
@@ -160,6 +160,12 @@ impl DrcHeap {

let drc_ref = drc_ref(gc_ref);
let header = self.index_mut(&drc_ref);
debug_assert_ne!(
*header.ref_count.get_mut(),
0,
"{:#p} is supposedly live; should have nonzero ref count",
*gc_ref
);
*header.ref_count.get_mut() += 1;
log::trace!(
"increment {:#p} ref count -> {}",
@@ -179,6 +185,12 @@ impl DrcHeap {

let drc_ref = drc_ref(gc_ref);
let header = self.index_mut(drc_ref);
debug_assert_ne!(
*header.ref_count.get_mut(),
0,
"{:#p} is supposedly live; should have nonzero ref count",
*gc_ref
);
*header.ref_count.get_mut() -= 1;
log::trace!(
"decrement {:#p} ref count -> {}",
@@ -261,6 +273,13 @@ impl DrcHeap {
continue;
}

debug_assert_ne!(
*self.index_mut(drc_ref(&gc_ref)).ref_count.get_mut(),
0,
"{gc_ref:#p} is on the Wasm stack and therefore should be held \
by the activations table; should have nonzero ref count",
);

log::trace!("Found GC reference on the stack: {:#p}", gc_ref);
let is_new = self
.activations_table
@@ -272,26 +291,76 @@ impl DrcHeap {
}
}

/// Sweep the bump allocation table after we've discovered our precise stack
/// roots.
fn sweep(&mut self, host_data_table: &mut ExternRefHostDataTable) {
// Sweep our bump chunk.
fn iter_bump_chunk(&mut self) -> impl Iterator<Item = VMGcRef> + '_ {
let num_filled = self.activations_table.num_filled_in_bump_chunk();
self.activations_table
.alloc
.chunk
.iter_mut()
.take(num_filled)
.map(|slot| {
let r64 = *slot.get_mut();
VMGcRef::from_r64(r64)
.expect("valid r64")
.expect("non-null")
})
}

#[inline(never)]
#[cold]
fn log_gc_ref_set(prefix: &str, items: impl Iterator<Item = VMGcRef>) {
assert!(log::log_enabled!(log::Level::Trace));
let mut set = "{".to_string();
let mut any = false;
for gc_ref in items {
any = true;
set += &format!("\n {gc_ref:#p},");
}
if any {
set.push('\n');
}
set.push('}');
log::trace!("{prefix}: {set}");
}

fn drain_bump_chunk(&mut self, mut f: impl FnMut(&mut Self, VMGcRef)) {
let num_filled = self.activations_table.num_filled_in_bump_chunk();

// Temporarily take the allocation out of `self` to avoid conflicting
// borrows.
let mut alloc = mem::take(&mut self.activations_table.alloc);
for slot in alloc.chunk.iter_mut().take(num_filled) {
let r64 = mem::take(slot.get_mut());
let gc_ref = VMGcRef::from_r64(r64)
.expect("valid r64")
.expect("non-null");
self.dec_ref_and_maybe_dealloc(host_data_table, &gc_ref);

f(self, gc_ref);
*slot.get_mut() = 0;
}

debug_assert!(
alloc.chunk.iter().all(|slot| unsafe { *slot.get() == 0 }),
"after sweeping the bump chunk, all slots should be empty"
alloc.chunk.iter_mut().all(|slot| *slot.get_mut() == 0),
"after sweeping the bump chunk, all slots should be empty",
);

debug_assert!(self.activations_table.alloc.chunk.is_empty());
self.activations_table.alloc = alloc;
}

/// Sweep the bump allocation table after we've discovered our precise stack
/// roots.
fn sweep(&mut self, host_data_table: &mut ExternRefHostDataTable) {
if log::log_enabled!(log::Level::Trace) {
Self::log_gc_ref_set("bump chunk before sweeping", self.iter_bump_chunk());
}

// Sweep our bump chunk.
log::trace!("Begin sweeping bump chunk");
self.drain_bump_chunk(|heap, gc_ref| {
heap.dec_ref_and_maybe_dealloc(host_data_table, &gc_ref);
});
log::trace!("Done sweeping bump chunk");

if self.activations_table.alloc.chunk.is_empty() {
// If this is the first collection, then the bump chunk is empty
// since we lazily allocate it. Force that lazy allocation now so we
@@ -302,6 +371,16 @@ impl DrcHeap {
self.activations_table.alloc.reset();
}

if log::log_enabled!(log::Level::Trace) {
Self::log_gc_ref_set(
"hash set before sweeping",
self.activations_table
.over_approximated_stack_roots
.iter()
.map(|r| r.unchecked_copy()),
);
}

// The current `precise_stack_roots` becomes our new over-approximated
// set for the next GC cycle.
mem::swap(
@@ -315,14 +394,26 @@
// Note that this may run arbitrary code as we run gc_ref
// destructors. Because of our `&mut` borrow above on this table,
// though, we're guaranteed that nothing will touch this table.
log::trace!("Begin sweeping hash set");
let mut precise_stack_roots = mem::take(&mut self.activations_table.precise_stack_roots);
for gc_ref in precise_stack_roots.drain() {
self.dec_ref_and_maybe_dealloc(host_data_table, &gc_ref);
}
log::trace!("Done sweeping hash set");

// Make sure to replace the `precise_stack_roots` so that we reuse any
// allocated capacity.
self.activations_table.precise_stack_roots = precise_stack_roots;

if log::log_enabled!(log::Level::Trace) {
Self::log_gc_ref_set(
"hash set after sweeping",
self.activations_table
.over_approximated_stack_roots
.iter()
.map(|r| r.unchecked_copy()),
);
}
}
}

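The `drain_bump_chunk` refactor above leans on a standard borrow-splitting idiom: temporarily move the allocation out of `self` with `mem::take`, iterate it mutably while still being able to call `&mut self` methods from the callback, then put it back. A minimal, self-contained sketch of that idiom (types invented for illustration):

use std::mem;

struct Heap {
    chunk: Vec<u64>,
    freed: usize,
}

impl Heap {
    fn dec_ref(&mut self, _raw: u64) {
        self.freed += 1;
    }

    fn drain_chunk(&mut self, mut f: impl FnMut(&mut Self, u64)) {
        // Move the chunk out of `self` so iterating it mutably does not
        // conflict with the `&mut self` borrow the callback needs.
        let mut chunk = mem::take(&mut self.chunk);
        for slot in chunk.iter_mut() {
            let raw = mem::take(slot); // read the slot and zero it
            f(self, raw);
        }
        debug_assert!(self.chunk.is_empty());
        self.chunk = chunk; // restore the (now zeroed) chunk
    }
}

fn main() {
    let mut heap = Heap { chunk: vec![1, 2, 3], freed: 0 };
    heap.drain_chunk(|h, raw| h.dec_ref(raw));
    assert_eq!(heap.freed, 3);
    assert_eq!(heap.chunk, vec![0u64, 0, 0]);
}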
17 changes: 10 additions & 7 deletions crates/runtime/src/gc/gc_runtime.rs
@@ -311,19 +311,22 @@ enum RawGcRoot {

impl GcRootsList {
/// Add a GC root that is inside a Wasm stack frame to this list.
//
// TODO: we shouldn't need to differentiate between on-stack GC roots and
// others, and both should just be `*mut VMGcRef`, but because Cranelift
// doesn't support `r32` on 64-bit platforms yet, we have to use `r64` for
// roots on the Wasm stack instead.
#[inline]
pub fn add_wasm_stack_root(&mut self, ptr_to_root: SendSyncPtr<u64>) {
pub unsafe fn add_wasm_stack_root(&mut self, ptr_to_root: SendSyncPtr<u64>) {
log::trace!(
"Adding Wasm stack root: {:#p}",
VMGcRef::from_r64(*ptr_to_root.as_ref()).unwrap().unwrap()
);
self.0.push(RawGcRoot::Stack(ptr_to_root));
}

/// Add a GC root to this list.
#[inline]
pub fn add_root(&mut self, ptr_to_root: SendSyncPtr<VMGcRef>) {
pub unsafe fn add_root(&mut self, ptr_to_root: SendSyncPtr<VMGcRef>) {
log::trace!(
"Adding non-stack root: {:#p}",
ptr_to_root.as_ref().unchecked_copy()
);
self.0.push(RawGcRoot::NonStack(ptr_to_root))
}

5 changes: 4 additions & 1 deletion crates/runtime/src/gc/host_data.rs
@@ -39,11 +39,14 @@ impl ExternRefHostDataTable {
/// Allocate a new `externref` host data value.
pub fn alloc(&mut self, value: Box<dyn Any + Send + Sync>) -> ExternRefHostDataId {
let id = self.slab.alloc(value);
ExternRefHostDataId(id)
let id = ExternRefHostDataId(id);
log::trace!("allocated new externref host data: {id:?}");
id
}

/// Deallocate an `externref` host data value.
pub fn dealloc(&mut self, id: ExternRefHostDataId) -> Box<dyn Any + Send + Sync> {
log::trace!("deallocated externref host data: {id:?}");
self.slab.dealloc(id.0)
}

1 change: 1 addition & 0 deletions crates/runtime/src/libcalls.rs
@@ -380,6 +380,7 @@ unsafe fn drop_gc_ref(instance: &mut Instance, gc_ref: *mut u8) {
let gc_ref = VMGcRef::from_r64(u64::try_from(gc_ref as usize).unwrap())
.expect("valid r64")
.expect("non-null VMGcRef");
log::trace!("libcalls::drop_gc_ref({gc_ref:?})");
(*instance.store()).gc_store().drop_gc_ref(gc_ref);
}
