
Commit e402369
Auto merge of rust-lang#136115 - Mark-Simulacrum:shard-alloc-id, r=<try>

Shard AllocMap Lock

This improves performance on many-seed parallel (-Zthreads=32) Miri executions, raising utilization from ~8 cores to 27-28 cores. That is about the same as with the data structure proposed in rust-lang#136105. I haven't analyzed this in depth, but I suspect the sharding might actually work out better when we commonly insert "densely", since sharding splits the cache lines while the OnceVec packs its locks close together. Of course, we could do something similar with the bitset lock too. Either way, this seems like a very reasonable starting point that solves the problem about equally well for what I can test locally.

r? `@RalfJung`
2 parents 0df0662 + b2bff4f commit e402369
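
For readers unfamiliar with the pattern: a sharded map hashes each key to one of N independently locked sub-maps, so two threads contend only when their keys land in the same shard. Below is a minimal sketch of that idea; it assumes nothing about rustc's real `ShardedHashMap` beyond the `lock_shard_by_value`-style routing visible in the diff, and the shard count is illustrative.

```rust
// A minimal sketch of a sharded map, not rustc's actual `ShardedHashMap`.
use std::collections::HashMap;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::sync::Mutex;

const SHARDS: usize = 32; // illustrative; rustc picks its own shard count

struct ShardedMap<K, V> {
    shards: Vec<Mutex<HashMap<K, V>>>,
}

impl<K: Hash + Eq, V: Clone> ShardedMap<K, V> {
    fn new() -> Self {
        ShardedMap { shards: (0..SHARDS).map(|_| Mutex::new(HashMap::new())).collect() }
    }

    /// Route `key` to the shard that owns it (the `lock_shard_by_value` idea).
    fn shard_for(&self, key: &K) -> &Mutex<HashMap<K, V>> {
        let mut h = DefaultHasher::new();
        key.hash(&mut h);
        &self.shards[h.finish() as usize % SHARDS]
    }

    fn insert(&self, key: K, value: V) -> Option<V> {
        // Only this shard's lock is held; other shards stay available.
        self.shard_for(&key).lock().unwrap().insert(key, value)
    }

    fn get(&self, key: &K) -> Option<V> {
        self.shard_for(key).lock().unwrap().get(key).cloned()
    }
}
```

With 32 shards, threads inserting unrelated keys mostly take distinct locks, which is the effect behind the core-utilization jump described above.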

File tree

2 files changed (+41 −25 lines)

compiler/rustc_middle/src/mir/interpret/mod.rs (+39 −23)
@@ -10,11 +10,13 @@ mod value;
 
 use std::io::{Read, Write};
 use std::num::NonZero;
+use std::sync::atomic::AtomicU64;
 use std::{fmt, io};
 
 use rustc_abi::{AddressSpace, Align, Endian, HasDataLayout, Size};
 use rustc_ast::{LitKind, Mutability};
 use rustc_data_structures::fx::FxHashMap;
+use rustc_data_structures::sharded::ShardedHashMap;
 use rustc_data_structures::sync::Lock;
 use rustc_hir::def::DefKind;
 use rustc_hir::def_id::{DefId, LocalDefId};
@@ -389,35 +391,37 @@ pub const CTFE_ALLOC_SALT: usize = 0;
 
 pub(crate) struct AllocMap<'tcx> {
     /// Maps `AllocId`s to their corresponding allocations.
-    alloc_map: FxHashMap<AllocId, GlobalAlloc<'tcx>>,
+    // Note that this map on rustc workloads seems to be rather dense. In #136105 we considered
+    // replacing it with a (dense) Vec-based map, but since there are workloads where it can be
+    // sparse we decided to go with sharding for now. At least up to 32 cores there doesn't seem to
+    // be much difference in performance on the one parallel workload we tested.
+    to_alloc: ShardedHashMap<AllocId, GlobalAlloc<'tcx>>,
 
     /// Used to deduplicate global allocations: functions, vtables, string literals, ...
     ///
     /// The `usize` is a "salt" used by Miri to make deduplication imperfect, thus better emulating
     /// the actual guarantees.
-    dedup: FxHashMap<(GlobalAlloc<'tcx>, usize), AllocId>,
+    dedup: Lock<FxHashMap<(GlobalAlloc<'tcx>, usize), AllocId>>,
 
     /// The `AllocId` to assign to the next requested ID.
     /// Always incremented; never gets smaller.
-    next_id: AllocId,
+    next_id: AtomicU64,
 }
 
 impl<'tcx> AllocMap<'tcx> {
     pub(crate) fn new() -> Self {
         AllocMap {
-            alloc_map: Default::default(),
+            to_alloc: Default::default(),
             dedup: Default::default(),
-            next_id: AllocId(NonZero::new(1).unwrap()),
+            next_id: AtomicU64::new(1),
         }
     }
-    fn reserve(&mut self) -> AllocId {
-        let next = self.next_id;
-        self.next_id.0 = self.next_id.0.checked_add(1).expect(
-            "You overflowed a u64 by incrementing by 1... \
-             You've just earned yourself a free drink if we ever meet. \
-             Seriously, how did you do that?!",
-        );
-        next
+    fn reserve(&self) -> AllocId {
+        // Technically there is a window here where we overflow and then another thread
+        // increments `next_id` *again* and uses it before we panic and tear down the entire session.
+        // We consider this fine since such overflows cannot realistically occur.
+        let next_id = self.next_id.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        AllocId(NonZero::new(next_id).unwrap())
     }
 }
 
@@ -428,26 +432,33 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Make sure to call `set_alloc_id_memory` or `set_alloc_id_same_memory` before returning such
     /// an `AllocId` from a query.
     pub fn reserve_alloc_id(self) -> AllocId {
-        self.alloc_map.lock().reserve()
+        self.alloc_map.reserve()
    }
 
     /// Reserves a new ID *if* this allocation has not been dedup-reserved before.
     /// Should not be used for mutable memory.
     fn reserve_and_set_dedup(self, alloc: GlobalAlloc<'tcx>, salt: usize) -> AllocId {
-        let mut alloc_map = self.alloc_map.lock();
         if let GlobalAlloc::Memory(mem) = alloc {
             if mem.inner().mutability.is_mut() {
                 bug!("trying to dedup-reserve mutable memory");
             }
         }
         let alloc_salt = (alloc, salt);
-        if let Some(&alloc_id) = alloc_map.dedup.get(&alloc_salt) {
+        let mut dedup = self.alloc_map.dedup.lock();
+        if let Some(&alloc_id) = dedup.get(&alloc_salt) {
             return alloc_id;
         }
-        let id = alloc_map.reserve();
+        let id = self.alloc_map.reserve();
         debug!("creating alloc {:?} with id {id:?}", alloc_salt.0);
-        alloc_map.alloc_map.insert(id, alloc_salt.0.clone());
-        alloc_map.dedup.insert(alloc_salt, id);
+        // We just reserved, so should always be unique.
+        assert!(
+            self.alloc_map
+                .to_alloc
+                .lock_shard_by_value(&id)
+                .insert(id, alloc_salt.0.clone())
+                .is_none()
+        );
+        dedup.insert(alloc_salt, id);
         id
     }
 
@@ -497,7 +508,7 @@
     /// local dangling pointers and allocations in constants/statics.
     #[inline]
     pub fn try_get_global_alloc(self, id: AllocId) -> Option<GlobalAlloc<'tcx>> {
-        self.alloc_map.lock().alloc_map.get(&id).cloned()
+        self.alloc_map.to_alloc.lock_shard_by_value(&id).get(&id).cloned()
     }
 
     #[inline]
@@ -516,16 +527,21 @@
     /// Freezes an `AllocId` created with `reserve` by pointing it at an `Allocation`. Trying to
     /// call this function twice, even with the same `Allocation` will ICE the compiler.
     pub fn set_alloc_id_memory(self, id: AllocId, mem: ConstAllocation<'tcx>) {
-        if let Some(old) = self.alloc_map.lock().alloc_map.insert(id, GlobalAlloc::Memory(mem)) {
+        if let Some(old) =
+            self.alloc_map.to_alloc.lock_shard_by_value(&id).insert(id, GlobalAlloc::Memory(mem))
+        {
             bug!("tried to set allocation ID {id:?}, but it was already existing as {old:#?}");
         }
     }
 
     /// Freezes an `AllocId` created with `reserve` by pointing it at a static item. Trying to
     /// call this function twice, even with the same `DefId` will ICE the compiler.
     pub fn set_nested_alloc_id_static(self, id: AllocId, def_id: LocalDefId) {
-        if let Some(old) =
-            self.alloc_map.lock().alloc_map.insert(id, GlobalAlloc::Static(def_id.to_def_id()))
+        if let Some(old) = self
+            .alloc_map
+            .to_alloc
+            .lock_shard_by_value(&id)
+            .insert(id, GlobalAlloc::Static(def_id.to_def_id()))
         {
             bug!("tried to set allocation ID {id:?}, but it was already existing as {old:#?}");
         }
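
The heart of this file's change is `reserve`: the `&mut self` counter becomes an `AtomicU64` bumped with a relaxed `fetch_add`, so reserving an `AllocId` no longer takes any lock at all. A standalone sketch of the same pattern (the `IdAllocator` name is mine, not rustc's):

```rust
use std::num::NonZero;
use std::sync::atomic::{AtomicU64, Ordering};

struct IdAllocator {
    next_id: AtomicU64, // starts at 1 so every returned ID is non-zero
}

impl IdAllocator {
    fn new() -> Self {
        IdAllocator { next_id: AtomicU64::new(1) }
    }

    // Takes `&self`, not `&mut self`: any number of threads may reserve
    // concurrently. Relaxed ordering suffices because callers only need
    // uniqueness, which `fetch_add` guarantees on its own. On the
    // (unrealistic) u64 wrap-around, `NonZero::new(0)` returns None and
    // the unwrap panics, matching the overflow behavior the diff's
    // comment describes.
    fn reserve(&self) -> NonZero<u64> {
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        NonZero::new(id).unwrap()
    }
}
```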

compiler/rustc_middle/src/ty/context.rs (+2 −2)
@@ -1348,7 +1348,7 @@ pub struct GlobalCtxt<'tcx> {
     pub data_layout: TargetDataLayout,
 
     /// Stores memory for globals (statics/consts).
-    pub(crate) alloc_map: Lock<interpret::AllocMap<'tcx>>,
+    pub(crate) alloc_map: interpret::AllocMap<'tcx>,
 }
 
 /// This is used to get a reference to a `GlobalCtxt` if one is available.
@@ -1538,7 +1538,7 @@ impl<'tcx> TyCtxt<'tcx> {
             new_solver_evaluation_cache: Default::default(),
             canonical_param_env_cache: Default::default(),
             data_layout,
-            alloc_map: Lock::new(interpret::AllocMap::new()),
+            alloc_map: interpret::AllocMap::new(),
         });
 
         let icx = tls::ImplicitCtxt::new(&gcx);
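
On the context.rs side, the outer `Lock` around the whole `AllocMap` disappears because every field of the struct now synchronizes itself. Schematically, with standard-library types and hypothetical names:

```rust
use std::collections::HashMap;
use std::sync::Mutex;
use std::sync::atomic::AtomicU64;

// Before: one coarse lock; reading the map and bumping the counter
// serialize against each other even though they touch disjoint state.
struct CoarseAllocMap {
    inner: Mutex<(HashMap<u64, String>, u64)>,
}

// After: field-level interior mutability; the owner stores the struct
// directly (no wrapper lock), and each operation pays only for the
// synchronization it actually needs.
struct FineAllocMap {
    to_alloc: Mutex<HashMap<u64, String>>, // rustc shards this map further
    next_id: AtomicU64,
}
```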
