Commit 0ebe54d

Merge commit from fork
* Fix a race condition in the Wasm type registry

Under certain concurrent event orderings, the type registry was susceptible to double-unregistration bugs due to a race condition. Consider the following example, where we have threads `A` and `B`, an existing rec group entry `E`, and another rec group entry `E'` that is a duplicate of `E`:

| Thread A                          | Thread B                       |
|-----------------------------------|--------------------------------|
| `acquire(type registry lock)`     |                                |
|                                   | `decref(E) --> 0`              |
|                                   | `block_on(type registry lock)` |
| `register(E') == incref(E) --> 1` |                                |
| `release(type registry lock)`     |                                |
| `decref(E) --> 0`                 |                                |
| `acquire(type registry lock)`     |                                |
| `unregister(E)`                   |                                |
| `release(type registry lock)`     |                                |
|                                   | `acquire(type registry lock)`  |
|                                   | `unregister(E)` !!!!!!         |

As you can see, in the above event ordering we would unregister `E` twice, leading to panics and potential type registry corruption.

This commit adds an `unregistered` flag to each entry, which is set when we commit to unregistering the entry, while we hold the type registry lock. When we are considering unregistering an entry at the beginning of `TypeRegistryInner::unregister_entry` (because we observed a zero-registrations count for that entry and then waited on the type registry's lock), we now check whether that flag was already set (by a concurrent unregistration that happened between observing the zero-registrations count and acquiring the lock) before we actually perform the unregistration.

Furthermore, in the process of writing a smoke test for concurrent module (and therefore type) loading and unloading, I discovered an index out-of-bounds panic during Wasm-to-CLIF module translation. This commit also includes the one-line fix for that bug and a `.wast` regression test for it.

* Update test to trigger the case requiring the `unregistered` flag

* Add another test further stressing concurrent interactions

Try to get a module's function type typed the wrong way and/or get Wasm to make a call with the wrong signature.

---------

Co-authored-by: Alex Crichton <alex@alexcrichton.com>
1 parent: bc6b0fe
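The guard pattern at the heart of the fix is easiest to see in miniature. The following is a hypothetical, self-contained sketch (the `Entry` and `Registry` types are invented for illustration and are not Wasmtime's actual code): a refcount that was observed hitting zero *before* the lock was taken gets re-validated *under* the lock, and an `unregistered` flag ensures at most one thread ever tears the entry down.

```rust
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};

// Hypothetical stand-ins for `RecGroupEntry` / `TypeRegistryInner`.
struct Entry {
    key: String,
    registrations: AtomicUsize,
    unregistered: AtomicBool,
}

#[derive(Default)]
struct Registry {
    map: Mutex<HashMap<String, Arc<Entry>>>,
}

impl Registry {
    /// Register a key, resurrecting an existing entry if one is hash-consed.
    fn register(&self, key: &str) -> Arc<Entry> {
        let mut map = self.map.lock().unwrap();
        if let Some(entry) = map.get(key) {
            // Resurrection: this may bump a count that another thread has
            // already observed at zero (the race in the commit message).
            entry.registrations.fetch_add(1, Ordering::AcqRel);
            return Arc::clone(entry);
        }
        let entry = Arc::new(Entry {
            key: key.to_string(),
            registrations: AtomicUsize::new(1),
            unregistered: AtomicBool::new(false),
        });
        map.insert(key.to_string(), Arc::clone(&entry));
        entry
    }

    /// Drop one registration. The decrement happens *outside* the lock, so
    /// both races described in the commit message can occur between the
    /// `fetch_sub` and the `lock()` below.
    fn unregister(&self, entry: &Arc<Entry>) {
        if entry.registrations.fetch_sub(1, Ordering::AcqRel) != 1 {
            return; // other registrations remain
        }
        let mut map = self.map.lock().unwrap();

        // Race (1): someone re-registered the entry in the meantime.
        if entry.registrations.load(Ordering::Acquire) != 0 {
            return;
        }
        // Race (2): someone else already won the race to unregister it.
        if entry.unregistered.swap(true, Ordering::AcqRel) {
            return;
        }
        map.remove(&entry.key);
    }
}

fn main() {
    let registry = Registry::default();
    let a = registry.register("rec-group");
    let b = registry.register("rec-group"); // hash-consed duplicate of `a`
    registry.unregister(&a);
    registry.unregister(&b); // the flag makes a second teardown impossible
}
```

The `swap` here compresses the patch's assert-then-`store(true, Release)` sequence into one operation; the real code can use a plain store because `unregister_entry` runs with exclusive `&mut self` access to the registry.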

File tree: 2 files changed (+417, -27 lines)


Diff for: crates/wasmtime/src/runtime/type_registry.rs

+181 -27

@@ -17,14 +17,15 @@ use core::{
     hash::{Hash, Hasher},
     ops::Range,
     sync::atomic::{
-        AtomicUsize,
-        Ordering::{AcqRel, Acquire},
+        AtomicBool, AtomicUsize,
+        Ordering::{AcqRel, Acquire, Release},
     },
 };
 use wasmtime_environ::{
-    iter_entity_range, packed_option::PackedOption, EngineOrModuleTypeIndex, GcLayout,
-    ModuleInternedTypeIndex, ModuleTypes, PrimaryMap, SecondaryMap, TypeTrace, VMSharedTypeIndex,
-    WasmRecGroup, WasmSubType,
+    iter_entity_range,
+    packed_option::{PackedOption, ReservedValue},
+    EngineOrModuleTypeIndex, GcLayout, ModuleInternedTypeIndex, ModuleTypes, PrimaryMap,
+    SecondaryMap, TypeTrace, VMSharedTypeIndex, WasmRecGroup, WasmSubType,
 };
 use wasmtime_slab::{Id as SlabId, Slab};
 
@@ -180,12 +181,15 @@ impl Drop for TypeCollection {
 
 #[inline]
 fn shared_type_index_to_slab_id(index: VMSharedTypeIndex) -> SlabId {
+    assert!(!index.is_reserved_value());
     SlabId::from_raw(index.bits())
 }
 
 #[inline]
 fn slab_id_to_shared_type_index(id: SlabId) -> VMSharedTypeIndex {
-    VMSharedTypeIndex::new(id.into_raw())
+    let index = VMSharedTypeIndex::new(id.into_raw());
+    assert!(!index.is_reserved_value());
+    index
 }
 
 /// A Wasm type that has been registered in the engine's `TypeRegistry`.
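The new `is_reserved_value` assertions guard every conversion between `VMSharedTypeIndex` and slab IDs. They matter because the newly imported `ReservedValue` trait underpins `PackedOption`, which stores an `Option` of an index type in the index's own bits by reserving a single bit-pattern to mean `None`. Below is a rough model of that encoding, using hypothetical simplified types rather than the real cranelift-entity definitions:

```rust
// A rough, hypothetical model of the `ReservedValue` / `PackedOption` idea.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
struct TypeIndex(u32);

impl TypeIndex {
    // The bit-pattern reserved to encode `None` (here, as is typical for
    // entity indices, the all-ones pattern).
    const RESERVED: u32 = u32::MAX;

    fn new(bits: u32) -> Self {
        assert_ne!(bits, Self::RESERVED, "index collides with the `None` encoding");
        TypeIndex(bits)
    }

    fn is_reserved_value(self) -> bool {
        self.0 == Self::RESERVED
    }
}

// `Option<TypeIndex>` in four bytes instead of eight: `None` is the reserved
// bit-pattern rather than a separate discriminant.
#[derive(Copy, Clone)]
struct PackedTypeIndex(u32);

impl PackedTypeIndex {
    const NONE: Self = PackedTypeIndex(TypeIndex::RESERVED);

    fn some(index: TypeIndex) -> Self {
        PackedTypeIndex(index.0)
    }

    fn expand(self) -> Option<TypeIndex> {
        if self.0 == TypeIndex::RESERVED {
            None
        } else {
            Some(TypeIndex(self.0))
        }
    }
}

fn main() {
    let index = TypeIndex::new(42);
    assert!(!index.is_reserved_value());
    assert_eq!(PackedTypeIndex::some(index).expand(), Some(index));
    assert_eq!(PackedTypeIndex::NONE.expand(), None);
}
```

A live index equal to the reserved pattern would silently decode as `None` wherever a `PackedOption` stores it, so asserting at each conversion boundary turns that kind of corruption into an immediate panic.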
@@ -417,8 +421,25 @@ impl Debug for RecGroupEntry
 struct RecGroupEntryInner {
     /// The Wasm rec group, canonicalized for hash consing.
     hash_consing_key: WasmRecGroup,
+
+    /// The shared type indices for each type in this rec group.
     shared_type_indices: Box<[VMSharedTypeIndex]>,
+
+    /// The number of times that this entry has been registered in the
+    /// `TypeRegistryInner`.
+    ///
+    /// This is an atomic counter so that cloning a `RegisteredType`, and
+    /// temporarily keeping a type registered, doesn't require locking the full
+    /// registry.
     registrations: AtomicUsize,
+
+    /// Whether this entry has already been unregistered from the
+    /// `TypeRegistryInner`.
+    ///
+    /// This flag exists to detect and avoid double-unregistration bugs that
+    /// could otherwise occur in rare cases. See the comments in
+    /// `TypeRegistryInner::unregister_type` for details.
+    unregistered: AtomicBool,
 }
 
 impl PartialEq for RecGroupEntry {
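The doc comment on `registrations` captures a design point worth spelling out: because the count is atomic, cloning a handle to a registered type never touches the registry-wide lock. Here is a minimal sketch of that clone/drop protocol, with invented `Registered`/`EntryInner` names rather than the crate's real API:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

// Hypothetical miniature of a registered-type handle.
struct EntryInner {
    registrations: AtomicUsize,
}

struct Registered {
    entry: Arc<EntryInner>,
}

impl Clone for Registered {
    fn clone(&self) -> Self {
        // Keeping the type registered a little longer costs one atomic
        // increment; no registry lock is needed.
        self.entry.registrations.fetch_add(1, Ordering::AcqRel);
        Registered {
            entry: Arc::clone(&self.entry),
        }
    }
}

impl Drop for Registered {
    fn drop(&mut self) {
        if self.entry.registrations.fetch_sub(1, Ordering::AcqRel) == 1 {
            // Only the *last* drop would take the registry lock and enter
            // the `unregister_entry` path that this commit hardens.
        }
    }
}

fn main() {
    let first = Registered {
        entry: Arc::new(EntryInner {
            registrations: AtomicUsize::new(1),
        }),
    };
    let second = first.clone(); // lock-free
    drop(first);
    drop(second); // the count reaches zero here
}
```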
@@ -611,6 +632,7 @@ impl TypeRegistryInner {
 
         // If we've already registered this rec group before, reuse it.
         if let Some(entry) = self.hash_consing_map.get(&hash_consing_key) {
+            assert_eq!(entry.0.unregistered.load(Acquire), false);
             entry.incref(
                 "hash consed to already-registered type in `TypeRegistryInner::register_rec_group`",
             );
@@ -622,8 +644,9 @@
         // while this rec group is still alive.
         hash_consing_key
             .trace_engine_indices::<_, ()>(&mut |index| {
-                let entry = &self.type_to_rec_group[index].as_ref().unwrap();
-                entry.incref(
+                let other_entry = &self.type_to_rec_group[index].as_ref().unwrap();
+                assert_eq!(other_entry.0.unregistered.load(Acquire), false);
+                other_entry.incref(
                     "new cross-group type reference to existing type in `register_rec_group`",
                 );
                 Ok(())
@@ -645,17 +668,32 @@
                         map[idx]
                     } else {
                         let rec_group_offset = idx.as_u32() - module_rec_group_start.as_u32();
-                        VMSharedTypeIndex::from_u32(engine_rec_group_start + rec_group_offset)
+                        let index =
+                            VMSharedTypeIndex::from_u32(engine_rec_group_start + rec_group_offset);
+                        assert!(!index.is_reserved_value());
+                        index
                     }
                 });
                 self.insert_one_type_from_rec_group(gc_runtime, module_index, ty)
             })
             .collect();
 
+        debug_assert_eq!(
+            shared_type_indices.len(),
+            shared_type_indices
+                .iter()
+                .copied()
+                .inspect(|ty| assert!(!ty.is_reserved_value()))
+                .collect::<crate::hash_set::HashSet<_>>()
+                .len(),
+            "should not have any duplicate type indices",
+        );
+
         let entry = RecGroupEntry(Arc::new(RecGroupEntryInner {
             hash_consing_key,
             shared_type_indices,
             registrations: AtomicUsize::new(1),
+            unregistered: AtomicBool::new(false),
         }));
         log::trace!("create new entry {entry:?} (registrations -> 1)");
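The new `debug_assert_eq!` checks uniqueness with a standard idiom: a sequence has no duplicates exactly when collecting it into a set preserves its length. In isolation, with plain `std` types instead of the crate's `hash_set` alias:

```rust
use std::collections::HashSet;

// A slice has duplicates exactly when the deduplicating collect shrinks it.
fn has_no_duplicates(indices: &[u32]) -> bool {
    indices.iter().copied().collect::<HashSet<_>>().len() == indices.len()
}

fn main() {
    assert!(has_no_duplicates(&[1, 2, 3]));
    assert!(!has_no_duplicates(&[1, 2, 2]));
}
```

Wrapping the check in `debug_assert_eq!` keeps the O(n) hashing and allocation out of release builds.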

@@ -845,29 +883,133 @@ impl TypeRegistryInner {
     /// zero remaining registrations.
     fn unregister_entry(&mut self, entry: RecGroupEntry) {
         debug_assert!(self.drop_stack.is_empty());
+
+        // There are two races to guard against before we can unregister the
+        // entry, even though it was on the drop stack:
+        //
+        // 1. Although an entry has to reach zero registrations before it is
+        //    enqueued in the drop stack, we need to double check whether the
+        //    entry is *still* at zero registrations. This is because someone
+        //    else can resurrect the entry in between when the
+        //    zero-registrations count was first observed and when we actually
+        //    acquire the lock to unregister it. In this example, we have
+        //    threads A and B, an existing rec group entry E, and a rec group
+        //    entry E' that is a duplicate of E:
+        //
+        //    Thread A                        | Thread B
+        //    --------------------------------+-----------------------------
+        //    acquire(type registry lock)     |
+        //                                    |
+        //                                    | decref(E) --> 0
+        //                                    |
+        //                                    | block_on(type registry lock)
+        //                                    |
+        //    register(E') == incref(E) --> 1 |
+        //                                    |
+        //    release(type registry lock)     |
+        //                                    |
+        //                                    | acquire(type registry lock)
+        //                                    |
+        //                                    | unregister(E) !!!!!!
+        //
+        //    If we aren't careful, we can unregister a type while it is still
+        //    in use!
+        //
+        //    The fix in this case is that we skip unregistering the entry if
+        //    its reference count is non-zero, since that means it was
+        //    concurrently resurrected and is now in use again.
+        //
+        // 2. In a slightly more convoluted version of (1), where an entry is
+        //    resurrected but then dropped *again*, someone might attempt to
+        //    unregister an entry a second time:
+        //
+        //    Thread A                        | Thread B
+        //    --------------------------------|-----------------------------
+        //    acquire(type registry lock)     |
+        //                                    |
+        //                                    | decref(E) --> 0
+        //                                    |
+        //                                    | block_on(type registry lock)
+        //                                    |
+        //    register(E') == incref(E) --> 1 |
+        //                                    |
+        //    release(type registry lock)     |
+        //                                    |
+        //    decref(E) --> 0                 |
+        //                                    |
+        //    acquire(type registry lock)     |
+        //                                    |
+        //    unregister(E)                   |
+        //                                    |
+        //    release(type registry lock)     |
+        //                                    |
+        //                                    | acquire(type registry lock)
+        //                                    |
+        //                                    | unregister(E) !!!!!!
+        //
+        //    If we aren't careful, we can unregister a type twice, which leads
+        //    to panics and registry corruption!
+        //
+        //    To detect this scenario and avoid the double-unregistration bug,
+        //    we maintain an `unregistered` flag on entries. We set this flag
+        //    once an entry is unregistered and therefore, even if it is
+        //    enqueued in the drop stack multiple times, we only actually
+        //    unregister the entry the first time.
+        //
+        // A final note: we don't need to worry about any concurrent
+        // modifications during the middle of this function's execution, only
+        // between (a) when we first observed a zero-registrations count and
+        // decided to unregister the type, and (b) when we acquired the type
+        // registry's lock so that we could perform that unregistration. This is
+        // because this method has exclusive access to `&mut self` -- that is,
+        // we have a write lock on the whole type registry -- and therefore no
+        // one else can create new references to this zero-registration entry
+        // and bring it back to life (which would require finding it in
+        // `self.hash_consing_map`, which no one else has access to, because we
+        // now have an exclusive lock on `self`).
+
+        // Handle scenario (1) from above.
+        let registrations = entry.0.registrations.load(Acquire);
+        if registrations != 0 {
+            log::trace!(
+                "{entry:?} was concurrently resurrected and no longer has \
+                 zero registrations (registrations -> {registrations})",
+            );
+            assert_eq!(entry.0.unregistered.load(Acquire), false);
+            return;
+        }
+
+        // Handle scenario (2) from above.
+        if entry.0.unregistered.load(Acquire) {
+            log::trace!(
+                "{entry:?} was concurrently resurrected, dropped again, \
+                 and already unregistered"
+            );
+            return;
+        }
+
+        // Okay, we are really going to unregister this entry. Enqueue it on the
+        // drop stack.
         self.drop_stack.push(entry);
 
+        // Keep unregistering entries until the drop stack is empty. This is
+        // logically a recursive process where if we unregister a type that was
+        // the only thing keeping another type alive, we then recursively
+        // unregister that other type as well. However, we use this explicit
+        // drop stack to avoid recursion and the potential stack overflows that
+        // recursion implies.
         while let Some(entry) = self.drop_stack.pop() {
             log::trace!("Start unregistering {entry:?}");
 
-            // We need to double check whether the entry is still at zero
-            // registrations: Between the time that we observed a zero and
-            // acquired the lock to call this function, another thread could
-            // have registered the type and found the 0-registrations entry in
-            // `self.map` and incremented its count.
-            //
-            // We don't need to worry about any concurrent increments during
-            // this function's invocation after we check for zero because we
-            // have exclusive access to `&mut self` and therefore no one can
-            // create a new reference to this entry and bring it back to life.
-            let registrations = entry.0.registrations.load(Acquire);
-            if registrations != 0 {
-                log::trace!(
-                    "{entry:?} was concurrently resurrected and no longer has \
-                     zero registrations (registrations -> {registrations})",
-                );
-                continue;
-            }
+            // All entries on the drop stack should *really* be ready for
+            // unregistration, since no one can resurrect entries once we've
+            // locked the registry.
+            assert_eq!(entry.0.registrations.load(Acquire), 0);
+            assert_eq!(entry.0.unregistered.load(Acquire), false);
+
+            // We are taking responsibility for unregistering this entry, so
+            // prevent anyone else from attempting to do it again.
+            entry.0.unregistered.store(true, Release);
 
             // Decrement any other types that this type was shallowly
             // (i.e. non-transitively) referencing and keeping alive. If this
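The comment above the `while let` loop describes turning a logically recursive teardown into a worklist. In generic form (illustrative names, not Wasmtime's types), the cascade looks like this: decrement everything the dead entry was shallowly keeping alive, and push rather than recurse when something else hits zero.

```rust
// Worklist version of "recursively unregister whatever this entry was the
// last thing keeping alive". Hypothetical, simplified representation.
fn cascade_unregister(children: &[Vec<usize>], refcount: &mut [usize], dead: usize) {
    let mut drop_stack = vec![dead];
    while let Some(entry) = drop_stack.pop() {
        println!("unregistering entry {entry}");
        // Decrement everything this entry was (shallowly) keeping alive; any
        // entry that hits zero is enqueued rather than recursed into.
        for &child in &children[entry] {
            refcount[child] -= 1;
            if refcount[child] == 0 {
                drop_stack.push(child);
            }
        }
    }
}

fn main() {
    // Entry 0 keeps 1 alive, and 1 keeps 2 alive: unregistering 0 cascades
    // through the whole chain without consuming any call-stack depth.
    let children = vec![vec![1], vec![2], vec![]];
    let mut refcount = vec![0, 1, 1];
    cascade_unregister(&children, &mut refcount, 0);
}
```

An explicit stack bounds the teardown's cost by the number of dead entries, instead of consuming call-stack frames proportional to the longest chain of type references.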
@@ -901,6 +1043,18 @@ impl TypeRegistryInner {
             // map. Additionally, stop holding a strong reference from each
             // function type in the rec group to that function type's trampoline
             // type.
+            debug_assert_eq!(
+                entry.0.shared_type_indices.len(),
+                entry
+                    .0
+                    .shared_type_indices
+                    .iter()
+                    .copied()
+                    .inspect(|ty| assert!(!ty.is_reserved_value()))
+                    .collect::<crate::hash_set::HashSet<_>>()
+                    .len(),
+                "should not have any duplicate type indices",
+            );
             for ty in entry.0.shared_type_indices.iter().copied() {
                 log::trace!("removing {ty:?} from registry");
 
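Finally, the smoke test mentioned in the commit message can be approximated through the public API alone. This sketch is not the commit's actual test, just the general shape of a stress loop that exercises these races (it assumes the `wasmtime` crate's default `wat` feature): identical modules hash-cons to the same registry entries, so concurrent compile/drop loops constantly resurrect entries that other threads are tearing down.

```rust
use wasmtime::{Engine, Module, Result};

fn main() -> Result<()> {
    // One shared engine, and therefore one shared type registry.
    let engine = Engine::default();
    let wat = r#"(module (func (export "f") (param i32) (result i32) local.get 0))"#;

    let handles: Vec<_> = (0..8)
        .map(|_| {
            let engine = engine.clone();
            std::thread::spawn(move || -> Result<()> {
                for _ in 0..100 {
                    // Compiling registers the module's rec groups in the
                    // engine's type registry; dropping unregisters them.
                    let module = Module::new(&engine, wat)?;
                    drop(module);
                }
                Ok(())
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap()?;
    }
    Ok(())
}
```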