From 7f1231c986c26dcd1b519b81b78371a750db797a Mon Sep 17 00:00:00 2001
From: Mark Rousskov <mark.simulacrum@gmail.com>
Date: Sun, 26 Jan 2025 20:47:49 -0500
Subject: [PATCH] Shard AllocMap Lock

This improves performance on many-seed parallel (-Zthreads=32) miri
executions from managing to use ~8 cores to using 27-28 cores. That's
pretty reasonable scaling for the simplicity of this solution.
---
 .../rustc_middle/src/mir/interpret/mod.rs     | 66 ++++++++++++-------
 compiler/rustc_middle/src/ty/context.rs       |  4 +-
 2 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/compiler/rustc_middle/src/mir/interpret/mod.rs b/compiler/rustc_middle/src/mir/interpret/mod.rs
index f4f9f221a7511..45c862e0d34bf 100644
--- a/compiler/rustc_middle/src/mir/interpret/mod.rs
+++ b/compiler/rustc_middle/src/mir/interpret/mod.rs
@@ -15,7 +15,8 @@ use std::{fmt, io};
 use rustc_abi::{AddressSpace, Align, Endian, HasDataLayout, Size};
 use rustc_ast::{LitKind, Mutability};
 use rustc_data_structures::fx::FxHashMap;
-use rustc_data_structures::sync::Lock;
+use rustc_data_structures::sharded::ShardedHashMap;
+use rustc_data_structures::sync::{AtomicU64, Lock};
 use rustc_hir::def::DefKind;
 use rustc_hir::def_id::{DefId, LocalDefId};
 use rustc_macros::{HashStable, TyDecodable, TyEncodable, TypeFoldable, TypeVisitable};
@@ -389,35 +390,39 @@ pub const CTFE_ALLOC_SALT: usize = 0;
 
 pub(crate) struct AllocMap<'tcx> {
     /// Maps `AllocId`s to their corresponding allocations.
-    alloc_map: FxHashMap<AllocId, GlobalAlloc<'tcx>>,
+    // Note that this map on rustc workloads seems to be rather dense, but in miri workloads should
+    // be pretty sparse. In #136105 we considered replacing it with a (dense) Vec-based map, but
+    // since there are workloads where it can be sparse we decided to go with sharding for now. At
+    // least up to 32 cores the one workload tested didn't exhibit much difference between the two.
+    //
+    // Should be locked *after* locking dedup if locking both to avoid deadlocks.
+    to_alloc: ShardedHashMap<AllocId, GlobalAlloc<'tcx>>,
 
     /// Used to deduplicate global allocations: functions, vtables, string literals, ...
     ///
     /// The `usize` is a "salt" used by Miri to make deduplication imperfect, thus better emulating
     /// the actual guarantees.
-    dedup: FxHashMap<(GlobalAlloc<'tcx>, usize), AllocId>,
+    dedup: Lock<FxHashMap<(GlobalAlloc<'tcx>, usize), AllocId>>,
 
     /// The `AllocId` to assign to the next requested ID.
     /// Always incremented; never gets smaller.
-    next_id: AllocId,
+    next_id: AtomicU64,
 }
 
 impl<'tcx> AllocMap<'tcx> {
     pub(crate) fn new() -> Self {
         AllocMap {
-            alloc_map: Default::default(),
+            to_alloc: Default::default(),
             dedup: Default::default(),
-            next_id: AllocId(NonZero::new(1).unwrap()),
+            next_id: AtomicU64::new(1),
         }
     }
-    fn reserve(&mut self) -> AllocId {
-        let next = self.next_id;
-        self.next_id.0 = self.next_id.0.checked_add(1).expect(
-            "You overflowed a u64 by incrementing by 1... \
-             You've just earned yourself a free drink if we ever meet. \
-             Seriously, how did you do that?!",
-        );
-        next
+    fn reserve(&self) -> AllocId {
+        // Technically there is a window here where we overflow and then another thread
+        // increments `next_id` *again* and uses it before we panic and tear down the entire session.
+        // We consider this fine since such overflows cannot realistically occur.
+        let next_id = self.next_id.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        AllocId(NonZero::new(next_id).unwrap())
     }
 }
 
@@ -428,26 +433,34 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Make sure to call `set_alloc_id_memory` or `set_alloc_id_same_memory` before returning such
     /// an `AllocId` from a query.
     pub fn reserve_alloc_id(self) -> AllocId {
-        self.alloc_map.lock().reserve()
+        self.alloc_map.reserve()
     }
 
     /// Reserves a new ID *if* this allocation has not been dedup-reserved before.
     /// Should not be used for mutable memory.
     fn reserve_and_set_dedup(self, alloc: GlobalAlloc<'tcx>, salt: usize) -> AllocId {
-        let mut alloc_map = self.alloc_map.lock();
         if let GlobalAlloc::Memory(mem) = alloc {
             if mem.inner().mutability.is_mut() {
                 bug!("trying to dedup-reserve mutable memory");
             }
         }
         let alloc_salt = (alloc, salt);
-        if let Some(&alloc_id) = alloc_map.dedup.get(&alloc_salt) {
+        // Locking this *before* `to_alloc` also to ensure correct lock order.
+        let mut dedup = self.alloc_map.dedup.lock();
+        if let Some(&alloc_id) = dedup.get(&alloc_salt) {
             return alloc_id;
         }
-        let id = alloc_map.reserve();
+        let id = self.alloc_map.reserve();
         debug!("creating alloc {:?} with id {id:?}", alloc_salt.0);
-        alloc_map.alloc_map.insert(id, alloc_salt.0.clone());
-        alloc_map.dedup.insert(alloc_salt, id);
+        let had_previous = self
+            .alloc_map
+            .to_alloc
+            .lock_shard_by_value(&id)
+            .insert(id, alloc_salt.0.clone())
+            .is_some();
+        // We just reserved, so should always be unique.
+        assert!(!had_previous);
+        dedup.insert(alloc_salt, id);
         id
     }
 
@@ -497,7 +510,7 @@ impl<'tcx> TyCtxt<'tcx> {
     /// local dangling pointers and allocations in constants/statics.
     #[inline]
     pub fn try_get_global_alloc(self, id: AllocId) -> Option<GlobalAlloc<'tcx>> {
-        self.alloc_map.lock().alloc_map.get(&id).cloned()
+        self.alloc_map.to_alloc.lock_shard_by_value(&id).get(&id).cloned()
     }
 
     #[inline]
@@ -516,7 +529,9 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Freezes an `AllocId` created with `reserve` by pointing it at an `Allocation`. Trying to
     /// call this function twice, even with the same `Allocation` will ICE the compiler.
     pub fn set_alloc_id_memory(self, id: AllocId, mem: ConstAllocation<'tcx>) {
-        if let Some(old) = self.alloc_map.lock().alloc_map.insert(id, GlobalAlloc::Memory(mem)) {
+        if let Some(old) =
+            self.alloc_map.to_alloc.lock_shard_by_value(&id).insert(id, GlobalAlloc::Memory(mem))
+        {
             bug!("tried to set allocation ID {id:?}, but it was already existing as {old:#?}");
         }
     }
@@ -524,8 +539,11 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Freezes an `AllocId` created with `reserve` by pointing it at a static item. Trying to
     /// call this function twice, even with the same `DefId` will ICE the compiler.
     pub fn set_nested_alloc_id_static(self, id: AllocId, def_id: LocalDefId) {
-        if let Some(old) =
-            self.alloc_map.lock().alloc_map.insert(id, GlobalAlloc::Static(def_id.to_def_id()))
+        if let Some(old) = self
+            .alloc_map
+            .to_alloc
+            .lock_shard_by_value(&id)
+            .insert(id, GlobalAlloc::Static(def_id.to_def_id()))
         {
             bug!("tried to set allocation ID {id:?}, but it was already existing as {old:#?}");
         }
diff --git a/compiler/rustc_middle/src/ty/context.rs b/compiler/rustc_middle/src/ty/context.rs
index 69f6fc0ad8a66..0f1fc891b0f96 100644
--- a/compiler/rustc_middle/src/ty/context.rs
+++ b/compiler/rustc_middle/src/ty/context.rs
@@ -1366,7 +1366,7 @@ pub struct GlobalCtxt<'tcx> {
     pub data_layout: TargetDataLayout,
 
     /// Stores memory for globals (statics/consts).
-    pub(crate) alloc_map: Lock<interpret::AllocMap<'tcx>>,
+    pub(crate) alloc_map: interpret::AllocMap<'tcx>,
 
     current_gcx: CurrentGcx,
 }
@@ -1583,7 +1583,7 @@ impl<'tcx> TyCtxt<'tcx> {
             new_solver_evaluation_cache: Default::default(),
             canonical_param_env_cache: Default::default(),
             data_layout,
-            alloc_map: Lock::new(interpret::AllocMap::new()),
+            alloc_map: interpret::AllocMap::new(),
             current_gcx,
         });