From da58efb11df229eac6eb727c2884546310f9ffde Mon Sep 17 00:00:00 2001 From: Mark Rousskov Date: Sun, 5 May 2024 19:57:24 -0400 Subject: [PATCH] Improve VecCache under parallel frontend This replaces the single Vec allocation with a series of progressively larger buckets. With the cfg for parallel enabled but with -Zthreads=1, this looks like a slight regression in i-count and cycle counts (<0.1%). With the parallel frontend at -Zthreads=4, this is an improvement (-5% wall-time from 5.788 to 5.4688 on libcore) than our current Lock-based approach, likely due to reducing the bouncing of the cache line holding the lock. At -Zthreads=32 it's a huge improvement (-46%: 8.829 -> 4.7319 seconds). --- compiler/rustc_data_structures/src/lib.rs | 2 + .../rustc_data_structures/src/vec_cache.rs | 324 ++++++++++++++++++ .../src/vec_cache/tests.rs | 95 +++++ compiler/rustc_middle/src/query/keys.rs | 7 +- compiler/rustc_query_system/src/lib.rs | 1 + .../rustc_query_system/src/query/caches.rs | 94 ++--- 6 files changed, 458 insertions(+), 65 deletions(-) create mode 100644 compiler/rustc_data_structures/src/vec_cache.rs create mode 100644 compiler/rustc_data_structures/src/vec_cache/tests.rs diff --git a/compiler/rustc_data_structures/src/lib.rs b/compiler/rustc_data_structures/src/lib.rs index afac08ae6f818..93abc0d29581d 100644 --- a/compiler/rustc_data_structures/src/lib.rs +++ b/compiler/rustc_data_structures/src/lib.rs @@ -22,6 +22,7 @@ #![feature(auto_traits)] #![feature(cfg_match)] #![feature(core_intrinsics)] +#![feature(dropck_eyepatch)] #![feature(extend_one)] #![feature(file_buffered)] #![feature(hash_raw_entry)] @@ -79,6 +80,7 @@ pub mod thinvec; pub mod transitive_relation; pub mod unhash; pub mod unord; +pub mod vec_cache; pub mod work_queue; mod atomic_ref; diff --git a/compiler/rustc_data_structures/src/vec_cache.rs b/compiler/rustc_data_structures/src/vec_cache.rs new file mode 100644 index 0000000000000..eb251b587c804 --- /dev/null +++ b/compiler/rustc_data_structures/src/vec_cache.rs @@ -0,0 +1,324 @@ +//! VecCache maintains a mapping from K -> (V, I) pairing. K and I must be roughly u32-sized, and V +//! must be Copy. +//! +//! VecCache supports efficient concurrent put/get across the key space, with write-once semantics +//! (i.e., a given key can only be put once). Subsequent puts will panic. +//! +//! This is currently used for query caching. + +use std::fmt::Debug; +use std::marker::PhantomData; +use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicUsize, Ordering}; + +use rustc_index::Idx; + +struct Slot { + // We never construct &Slot so it's fine for this to not be in an UnsafeCell. + value: V, + // This is both an index and a once-lock. + // + // 0: not yet initialized. + // 1: lock held, initializing. + // 2..u32::MAX - 2: initialized. + index_and_lock: AtomicU32, +} + +/// This uniquely identifies a single `Slot` entry in the buckets map, and provides accessors for +/// either getting the value or putting a value. +#[derive(Copy, Clone, Debug)] +struct SlotIndex { + // the index of the bucket in VecCache (0 to 20) + bucket_idx: usize, + // number of entries in that bucket + entries: usize, + // the index of the slot within the bucket + index_in_bucket: usize, +} + +// This makes sure the counts are consistent with what we allocate, precomputing each bucket a +// compile-time. Visiting all powers of two is enough to hit all the buckets. +// +// We confirm counts are accurate in the slot_index_exhaustive test. +const ENTRIES_BY_BUCKET: [usize; 21] = { + let mut entries = [0; 21]; + let mut key = 0; + loop { + let si = SlotIndex::from_index(key); + entries[si.bucket_idx] = si.entries; + if key == 0 { + key = 1; + } else if key == (1 << 31) { + break; + } else { + key <<= 1; + } + } + entries +}; + +impl SlotIndex { + // This unpacks a flat u32 index into identifying which bucket it belongs to and the offset + // within that bucket. As noted in the VecCache docs, buckets double in size with each index. + // Typically that would mean 31 buckets (2^0 + 2^1 ... + 2^31 = u32::MAX - 1), but to reduce + // the size of the VecCache struct and avoid uselessly small allocations, we instead have the + // first bucket have 2**12 entries. To simplify the math, the second bucket also 2**12 entries, + // and buckets double from there. + // + // We assert that [0, 2**32 - 1] uniquely map through this function to individual, consecutive + // slots (see `slot_index_exhaustive` in tests). + #[inline] + const fn from_index(idx: u32) -> Self { + let mut bucket = match idx.checked_ilog2() { + Some(x) => x as usize, + None => 0, + }; + let entries; + let running_sum; + if bucket <= 11 { + entries = 1 << 12; + running_sum = 0; + bucket = 0; + } else { + entries = 1 << bucket; + running_sum = entries; + bucket = bucket - 11; + } + SlotIndex { bucket_idx: bucket, entries, index_in_bucket: idx as usize - running_sum } + } + + // SAFETY: Buckets must be managed solely by functions here (i.e., get/put on SlotIndex) and + // `self` comes from SlotIndex::from_index + #[inline] + unsafe fn get(&self, buckets: &[AtomicPtr>; 21]) -> Option<(V, u32)> { + // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e., + // in-bounds of buckets. See `from_index` for computation. + let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) }; + let ptr = bucket.load(Ordering::Acquire); + // Bucket is not yet initialized: then we obviously won't find this entry in that bucket. + if ptr.is_null() { + return None; + } + assert!(self.index_in_bucket < self.entries); + // SAFETY: `bucket` was allocated (so <= isize in total bytes) to hold `entries`, so this + // must be inbounds. + let slot = unsafe { ptr.add(self.index_in_bucket) }; + + // SAFETY: initialized bucket has zeroed all memory within the bucket, so we are valid for + // AtomicU32 access. + let index_and_lock = unsafe { &(*slot).index_and_lock }; + let current = index_and_lock.load(Ordering::Acquire); + let index = match current { + 0 => return None, + // Treat "initializing" as actually just not initialized at all. + // The only reason this is a separate state is that `complete` calls could race and + // we can't allow that, but from load perspective there's no difference. + 1 => return None, + _ => current - 2, + }; + + // SAFETY: + // * slot is a valid pointer (buckets are always valid for the index we get). + // * value is initialized since we saw a >= 2 index above. + // * `V: Copy`, so safe to read. + let value = unsafe { (*slot).value }; + Some((value, index)) + } + + fn bucket_ptr(&self, bucket: &AtomicPtr>) -> *mut Slot { + let ptr = bucket.load(Ordering::Acquire); + if ptr.is_null() { self.initialize_bucket(bucket) } else { ptr } + } + + #[cold] + fn initialize_bucket(&self, bucket: &AtomicPtr>) -> *mut Slot { + static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + + // If we are initializing the bucket, then acquire a global lock. + // + // This path is quite cold, so it's cheap to use a global lock. This ensures that we never + // have multiple allocations for the same bucket. + let _allocator_guard = LOCK.lock().unwrap_or_else(|e| e.into_inner()); + + let ptr = bucket.load(Ordering::Acquire); + + // OK, now under the allocator lock, if we're still null then it's definitely us that will + // initialize this bucket. + if ptr.is_null() { + let bucket_layout = + std::alloc::Layout::array::>(self.entries as usize).unwrap(); + // This is more of a sanity check -- this code is very cold, so it's safe to pay a + // little extra cost here. + assert!(bucket_layout.size() > 0); + // SAFETY: Just checked that size is non-zero. + let allocated = unsafe { std::alloc::alloc_zeroed(bucket_layout).cast::>() }; + if allocated.is_null() { + std::alloc::handle_alloc_error(bucket_layout); + } + bucket.store(allocated, Ordering::Release); + allocated + } else { + // Otherwise some other thread initialized this bucket after we took the lock. In that + // case, just return early. + ptr + } + } + + /// Returns true if this successfully put into the map. + #[inline] + fn put(&self, buckets: &[AtomicPtr>; 21], value: V, extra: u32) -> bool { + // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e., + // in-bounds of buckets. + let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) }; + let ptr = self.bucket_ptr(bucket); + + assert!(self.index_in_bucket < self.entries); + // SAFETY: `bucket` was allocated (so <= isize in total bytes) to hold `entries`, so this + // must be inbounds. + let slot = unsafe { ptr.add(self.index_in_bucket) }; + + // SAFETY: initialized bucket has zeroed all memory within the bucket, so we are valid for + // AtomicU32 access. + let index_and_lock = unsafe { &(*slot).index_and_lock }; + match index_and_lock.compare_exchange(0, 1, Ordering::AcqRel, Ordering::Acquire) { + Ok(_) => { + // We have acquired the initialization lock. It is our job to write `value` and + // then set the lock to the real index. + + unsafe { + (&raw mut (*slot).value).write(value); + } + + index_and_lock.store(extra.checked_add(2).unwrap(), Ordering::Release); + + true + } + + // Treat "initializing" as the caller's fault. Callers are responsible for ensuring that + // there are no races on initialization. In the compiler's current usage for query + // caches, that's the "active query map" which ensures each query actually runs once + // (even if concurrently started). + Err(1) => panic!("caller raced calls to put()"), + + // This slot was already populated. Also ignore, currently this is the same as + // "initializing". + Err(_) => false, + } + } +} + +pub struct VecCache { + // Entries per bucket: + // Bucket 0: 4096 2^12 + // Bucket 1: 4096 2^12 + // Bucket 2: 8192 + // Bucket 3: 16384 + // ... + // Bucket 19: 1073741824 + // Bucket 20: 2147483648 + // The total number of entries if all buckets are initialized is u32::MAX-1. + buckets: [AtomicPtr>; 21], + + // In the compiler's current usage these are only *read* during incremental and self-profiling. + // They are an optimization over iterating the full buckets array. + present: [AtomicPtr>; 21], + len: AtomicUsize, + + key: PhantomData<(K, I)>, +} + +impl Default for VecCache { + fn default() -> Self { + VecCache { + buckets: Default::default(), + key: PhantomData, + len: Default::default(), + present: Default::default(), + } + } +} + +// SAFETY: No access to `V` is made. +unsafe impl Drop for VecCache { + fn drop(&mut self) { + // We have unique ownership, so no locks etc. are needed. Since `K` and `V` are both `Copy`, + // we are also guaranteed to just need to deallocate any large arrays (not iterate over + // contents). + // + // Confirm no need to deallocate invidual entries. Note that `V: Copy` is asserted on + // insert/lookup but not necessarily construction, primarily to avoid annoyingly propagating + // the bounds into struct definitions everywhere. + assert!(!std::mem::needs_drop::()); + assert!(!std::mem::needs_drop::()); + + for (idx, bucket) in self.buckets.iter().enumerate() { + let bucket = bucket.load(Ordering::Acquire); + if !bucket.is_null() { + let layout = std::alloc::Layout::array::>(ENTRIES_BY_BUCKET[idx]).unwrap(); + unsafe { + std::alloc::dealloc(bucket.cast(), layout); + } + } + } + + for (idx, bucket) in self.present.iter().enumerate() { + let bucket = bucket.load(Ordering::Acquire); + if !bucket.is_null() { + let layout = std::alloc::Layout::array::>(ENTRIES_BY_BUCKET[idx]).unwrap(); + unsafe { + std::alloc::dealloc(bucket.cast(), layout); + } + } + } + } +} + +impl VecCache +where + K: Eq + Idx + Copy + Debug, + V: Copy, + I: Idx + Copy, +{ + #[inline(always)] + pub fn lookup(&self, key: &K) -> Option<(V, I)> { + let key = u32::try_from(key.index()).unwrap(); + let slot_idx = SlotIndex::from_index(key); + match unsafe { slot_idx.get(&self.buckets) } { + Some((value, idx)) => Some((value, I::new(idx as usize))), + None => None, + } + } + + #[inline] + pub fn complete(&self, key: K, value: V, index: I) { + let key = u32::try_from(key.index()).unwrap(); + let slot_idx = SlotIndex::from_index(key); + if slot_idx.put(&self.buckets, value, index.index() as u32) { + let present_idx = self.len.fetch_add(1, Ordering::Relaxed); + let slot = SlotIndex::from_index(present_idx as u32); + // We should always be uniquely putting due to `len` fetch_add returning unique values. + assert!(slot.put(&self.present, (), key)); + } + } + + pub fn iter(&self, f: &mut dyn FnMut(&K, &V, I)) { + for idx in 0..self.len.load(Ordering::Acquire) { + let key = SlotIndex::from_index(idx as u32); + match unsafe { key.get(&self.present) } { + // This shouldn't happen in our current usage (iter is really only + // used long after queries are done running), but if we hit this in practice it's + // probably fine to just break early. + None => unreachable!(), + Some(((), key)) => { + let key = K::new(key as usize); + // unwrap() is OK: present entries are always written only after we put the real + // entry. + let value = self.lookup(&key).unwrap(); + f(&key, &value.0, value.1); + } + } + } + } +} + +#[cfg(test)] +mod tests; diff --git a/compiler/rustc_data_structures/src/vec_cache/tests.rs b/compiler/rustc_data_structures/src/vec_cache/tests.rs new file mode 100644 index 0000000000000..a05f274136209 --- /dev/null +++ b/compiler/rustc_data_structures/src/vec_cache/tests.rs @@ -0,0 +1,95 @@ +use super::*; + +#[test] +#[cfg(not(miri))] +fn vec_cache_empty() { + let cache: VecCache = VecCache::default(); + for key in 0..u32::MAX { + assert!(cache.lookup(&key).is_none()); + } +} + +#[test] +fn vec_cache_insert_and_check() { + let cache: VecCache = VecCache::default(); + cache.complete(0, 1, 2); + assert_eq!(cache.lookup(&0), Some((1, 2))); +} + +#[test] +fn sparse_inserts() { + let cache: VecCache = VecCache::default(); + let end = if cfg!(target_pointer_width = "64") && cfg!(target_os = "linux") { + // For paged memory, 64-bit systems we should be able to sparsely allocate all of the pages + // needed for these inserts cheaply (without needing to actually have gigabytes of resident + // memory). + 31 + } else { + // Otherwise, still run the test but scaled back: + // + // Each slot is 5 bytes, so 2^25 entries (on non-virtual memory systems, like e.g. Windows) will + // mean 160 megabytes of allocated memory. Going beyond that is probably not reasonable for + // tests. + 25 + }; + for shift in 0..end { + let key = 1u32 << shift; + cache.complete(key, shift, key); + assert_eq!(cache.lookup(&key), Some((shift, key))); + } +} + +#[test] +fn concurrent_stress_check() { + let cache: VecCache = VecCache::default(); + std::thread::scope(|s| { + for idx in 0..100 { + let cache = &cache; + s.spawn(move || { + cache.complete(idx, idx, idx); + }); + } + }); + + for idx in 0..100 { + assert_eq!(cache.lookup(&idx), Some((idx, idx))); + } +} + +#[test] +fn slot_entries_table() { + assert_eq!(ENTRIES_BY_BUCKET, [ + 4096, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, + 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, + 2147483648 + ]); +} + +#[test] +#[cfg(not(miri))] +fn slot_index_exhaustive() { + let mut buckets = [0u32; 21]; + for idx in 0..=u32::MAX { + buckets[SlotIndex::from_index(idx).bucket_idx] += 1; + } + let mut prev = None::; + for idx in 0..=u32::MAX { + let slot_idx = SlotIndex::from_index(idx); + if let Some(p) = prev { + if p.bucket_idx == slot_idx.bucket_idx { + assert_eq!(p.index_in_bucket + 1, slot_idx.index_in_bucket); + } else { + assert_eq!(slot_idx.index_in_bucket, 0); + } + } else { + assert_eq!(idx, 0); + assert_eq!(slot_idx.index_in_bucket, 0); + assert_eq!(slot_idx.bucket_idx, 0); + } + + assert_eq!(buckets[slot_idx.bucket_idx], slot_idx.entries as u32); + assert_eq!(ENTRIES_BY_BUCKET[slot_idx.bucket_idx], slot_idx.entries, "{}", idx); + + prev = Some(slot_idx); + } +} diff --git a/compiler/rustc_middle/src/query/keys.rs b/compiler/rustc_middle/src/query/keys.rs index fe28ef0f70c79..1fbebf4b09328 100644 --- a/compiler/rustc_middle/src/query/keys.rs +++ b/compiler/rustc_middle/src/query/keys.rs @@ -2,6 +2,7 @@ use rustc_hir::def_id::{CrateNum, DefId, LOCAL_CRATE, LocalDefId, LocalModDefId, ModDefId}; use rustc_hir::hir_id::{HirId, OwnerId}; +use rustc_query_system::dep_graph::DepNodeIndex; use rustc_query_system::query::{DefIdCache, DefaultCache, SingleCache, VecCache}; use rustc_span::symbol::{Ident, Symbol}; use rustc_span::{DUMMY_SP, Span}; @@ -110,7 +111,7 @@ impl<'tcx> Key for mir::interpret::LitToConstInput<'tcx> { } impl Key for CrateNum { - type Cache = VecCache; + type Cache = VecCache; fn default_span(&self, _: TyCtxt<'_>) -> Span { DUMMY_SP @@ -127,7 +128,7 @@ impl AsLocalKey for CrateNum { } impl Key for OwnerId { - type Cache = VecCache; + type Cache = VecCache; fn default_span(&self, tcx: TyCtxt<'_>) -> Span { self.to_def_id().default_span(tcx) @@ -139,7 +140,7 @@ impl Key for OwnerId { } impl Key for LocalDefId { - type Cache = VecCache; + type Cache = VecCache; fn default_span(&self, tcx: TyCtxt<'_>) -> Span { self.to_def_id().default_span(tcx) diff --git a/compiler/rustc_query_system/src/lib.rs b/compiler/rustc_query_system/src/lib.rs index ba7a631fb5419..a85e8a55a21a9 100644 --- a/compiler/rustc_query_system/src/lib.rs +++ b/compiler/rustc_query_system/src/lib.rs @@ -2,6 +2,7 @@ #![allow(rustc::potential_query_instability, internal_features)] #![feature(assert_matches)] #![feature(core_intrinsics)] +#![feature(dropck_eyepatch)] #![feature(hash_raw_entry)] #![feature(let_chains)] #![feature(min_specialization)] diff --git a/compiler/rustc_query_system/src/query/caches.rs b/compiler/rustc_query_system/src/query/caches.rs index a4ced3d2c2452..e6f3d97742d31 100644 --- a/compiler/rustc_query_system/src/query/caches.rs +++ b/compiler/rustc_query_system/src/query/caches.rs @@ -3,9 +3,10 @@ use std::hash::Hash; use rustc_data_structures::fx::FxHashMap; use rustc_data_structures::sharded::{self, Sharded}; -use rustc_data_structures::sync::{Lock, OnceLock}; +use rustc_data_structures::sync::OnceLock; +pub use rustc_data_structures::vec_cache::VecCache; use rustc_hir::def_id::LOCAL_CRATE; -use rustc_index::{Idx, IndexVec}; +use rustc_index::Idx; use rustc_span::def_id::{DefId, DefIndex}; use crate::dep_graph::DepNodeIndex; @@ -100,52 +101,10 @@ where } } -pub struct VecCache { - cache: Lock>>, -} - -impl Default for VecCache { - fn default() -> Self { - VecCache { cache: Default::default() } - } -} - -impl QueryCache for VecCache -where - K: Eq + Idx + Copy + Debug, - V: Copy, -{ - type Key = K; - type Value = V; - - #[inline(always)] - fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> { - let lock = self.cache.lock(); - if let Some(Some(value)) = lock.get(*key) { Some(*value) } else { None } - } - - #[inline] - fn complete(&self, key: K, value: V, index: DepNodeIndex) { - let mut lock = self.cache.lock(); - lock.insert(key, (value, index)); - } - - fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) { - for (k, v) in self.cache.lock().iter_enumerated() { - if let Some(v) = v { - f(&k, &v.0, v.1); - } - } - } -} - pub struct DefIdCache { /// Stores the local DefIds in a dense map. Local queries are much more often dense, so this is /// a win over hashing query keys at marginal memory cost (~5% at most) compared to FxHashMap. - /// - /// The second element of the tuple is the set of keys actually present in the IndexVec, used - /// for faster iteration in `iter()`. - local: Lock<(IndexVec>, Vec)>, + local: VecCache, foreign: DefaultCache, } @@ -165,8 +124,7 @@ where #[inline(always)] fn lookup(&self, key: &DefId) -> Option<(V, DepNodeIndex)> { if key.krate == LOCAL_CRATE { - let cache = self.local.lock(); - cache.0.get(key.index).and_then(|v| *v) + self.local.lookup(&key.index) } else { self.foreign.lookup(key) } @@ -175,27 +133,39 @@ where #[inline] fn complete(&self, key: DefId, value: V, index: DepNodeIndex) { if key.krate == LOCAL_CRATE { - let mut cache = self.local.lock(); - let (cache, present) = &mut *cache; - let slot = cache.ensure_contains_elem(key.index, Default::default); - if slot.is_none() { - // FIXME: Only store the present set when running in incremental mode. `iter` is not - // used outside of saving caches to disk and self-profile. - present.push(key.index); - } - *slot = Some((value, index)); + self.local.complete(key.index, value, index) } else { self.foreign.complete(key, value, index) } } fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) { - let guard = self.local.lock(); - let (cache, present) = &*guard; - for &idx in present.iter() { - let value = cache[idx].unwrap(); - f(&DefId { krate: LOCAL_CRATE, index: idx }, &value.0, value.1); - } + self.local.iter(&mut |key, value, index| { + f(&DefId { krate: LOCAL_CRATE, index: *key }, value, index); + }); self.foreign.iter(f); } } + +impl QueryCache for VecCache +where + K: Idx + Eq + Hash + Copy + Debug, + V: Copy, +{ + type Key = K; + type Value = V; + + #[inline(always)] + fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> { + self.lookup(key) + } + + #[inline] + fn complete(&self, key: K, value: V, index: DepNodeIndex) { + self.complete(key, value, index) + } + + fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) { + self.iter(f) + } +}