-
Notifications
You must be signed in to change notification settings - Fork 12.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Auto merge of #124780 - Mark-Simulacrum:lockless-cache, r=lcnr
Improve VecCache under parallel frontend This replaces the single Vec allocation with a series of progressively larger buckets. With the cfg for parallel enabled but with -Zthreads=1, this looks like a slight regression in i-count and cycle counts (~1%). With the parallel frontend at -Zthreads=4, this is an improvement (-5% wall-time from 5.788 to 5.4688 on libcore) than our current Lock-based approach, likely due to reducing the bouncing of the cache line holding the lock. At -Zthreads=32 it's a huge improvement (-46%: 8.829 -> 4.7319 seconds). try-job: i686-gnu-nopt try-job: dist-x86_64-linux
- Loading branch information
Showing
6 changed files
with
458 additions
and
65 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,324 @@ | ||
//! VecCache maintains a mapping from K -> (V, I) pairing. K and I must be roughly u32-sized, and V | ||
//! must be Copy. | ||
//! | ||
//! VecCache supports efficient concurrent put/get across the key space, with write-once semantics | ||
//! (i.e., a given key can only be put once). Subsequent puts will panic. | ||
//! | ||
//! This is currently used for query caching. | ||
use std::fmt::Debug; | ||
use std::marker::PhantomData; | ||
use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicUsize, Ordering}; | ||
|
||
use rustc_index::Idx; | ||
|
||
struct Slot<V> { | ||
// We never construct &Slot<V> so it's fine for this to not be in an UnsafeCell. | ||
value: V, | ||
// This is both an index and a once-lock. | ||
// | ||
// 0: not yet initialized. | ||
// 1: lock held, initializing. | ||
// 2..u32::MAX - 2: initialized. | ||
index_and_lock: AtomicU32, | ||
} | ||
|
||
/// This uniquely identifies a single `Slot<V>` entry in the buckets map, and provides accessors for | ||
/// either getting the value or putting a value. | ||
#[derive(Copy, Clone, Debug)] | ||
struct SlotIndex { | ||
// the index of the bucket in VecCache (0 to 20) | ||
bucket_idx: usize, | ||
// number of entries in that bucket | ||
entries: usize, | ||
// the index of the slot within the bucket | ||
index_in_bucket: usize, | ||
} | ||
|
||
// This makes sure the counts are consistent with what we allocate, precomputing each bucket a | ||
// compile-time. Visiting all powers of two is enough to hit all the buckets. | ||
// | ||
// We confirm counts are accurate in the slot_index_exhaustive test. | ||
const ENTRIES_BY_BUCKET: [usize; 21] = { | ||
let mut entries = [0; 21]; | ||
let mut key = 0; | ||
loop { | ||
let si = SlotIndex::from_index(key); | ||
entries[si.bucket_idx] = si.entries; | ||
if key == 0 { | ||
key = 1; | ||
} else if key == (1 << 31) { | ||
break; | ||
} else { | ||
key <<= 1; | ||
} | ||
} | ||
entries | ||
}; | ||
|
||
impl SlotIndex { | ||
// This unpacks a flat u32 index into identifying which bucket it belongs to and the offset | ||
// within that bucket. As noted in the VecCache docs, buckets double in size with each index. | ||
// Typically that would mean 31 buckets (2^0 + 2^1 ... + 2^31 = u32::MAX - 1), but to reduce | ||
// the size of the VecCache struct and avoid uselessly small allocations, we instead have the | ||
// first bucket have 2**12 entries. To simplify the math, the second bucket also 2**12 entries, | ||
// and buckets double from there. | ||
// | ||
// We assert that [0, 2**32 - 1] uniquely map through this function to individual, consecutive | ||
// slots (see `slot_index_exhaustive` in tests). | ||
#[inline] | ||
const fn from_index(idx: u32) -> Self { | ||
let mut bucket = match idx.checked_ilog2() { | ||
Some(x) => x as usize, | ||
None => 0, | ||
}; | ||
let entries; | ||
let running_sum; | ||
if bucket <= 11 { | ||
entries = 1 << 12; | ||
running_sum = 0; | ||
bucket = 0; | ||
} else { | ||
entries = 1 << bucket; | ||
running_sum = entries; | ||
bucket = bucket - 11; | ||
} | ||
SlotIndex { bucket_idx: bucket, entries, index_in_bucket: idx as usize - running_sum } | ||
} | ||
|
||
// SAFETY: Buckets must be managed solely by functions here (i.e., get/put on SlotIndex) and | ||
// `self` comes from SlotIndex::from_index | ||
#[inline] | ||
unsafe fn get<V: Copy>(&self, buckets: &[AtomicPtr<Slot<V>>; 21]) -> Option<(V, u32)> { | ||
// SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e., | ||
// in-bounds of buckets. See `from_index` for computation. | ||
let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) }; | ||
let ptr = bucket.load(Ordering::Acquire); | ||
// Bucket is not yet initialized: then we obviously won't find this entry in that bucket. | ||
if ptr.is_null() { | ||
return None; | ||
} | ||
assert!(self.index_in_bucket < self.entries); | ||
// SAFETY: `bucket` was allocated (so <= isize in total bytes) to hold `entries`, so this | ||
// must be inbounds. | ||
let slot = unsafe { ptr.add(self.index_in_bucket) }; | ||
|
||
// SAFETY: initialized bucket has zeroed all memory within the bucket, so we are valid for | ||
// AtomicU32 access. | ||
let index_and_lock = unsafe { &(*slot).index_and_lock }; | ||
let current = index_and_lock.load(Ordering::Acquire); | ||
let index = match current { | ||
0 => return None, | ||
// Treat "initializing" as actually just not initialized at all. | ||
// The only reason this is a separate state is that `complete` calls could race and | ||
// we can't allow that, but from load perspective there's no difference. | ||
1 => return None, | ||
_ => current - 2, | ||
}; | ||
|
||
// SAFETY: | ||
// * slot is a valid pointer (buckets are always valid for the index we get). | ||
// * value is initialized since we saw a >= 2 index above. | ||
// * `V: Copy`, so safe to read. | ||
let value = unsafe { (*slot).value }; | ||
Some((value, index)) | ||
} | ||
|
||
fn bucket_ptr<V>(&self, bucket: &AtomicPtr<Slot<V>>) -> *mut Slot<V> { | ||
let ptr = bucket.load(Ordering::Acquire); | ||
if ptr.is_null() { self.initialize_bucket(bucket) } else { ptr } | ||
} | ||
|
||
#[cold] | ||
fn initialize_bucket<V>(&self, bucket: &AtomicPtr<Slot<V>>) -> *mut Slot<V> { | ||
static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); | ||
|
||
// If we are initializing the bucket, then acquire a global lock. | ||
// | ||
// This path is quite cold, so it's cheap to use a global lock. This ensures that we never | ||
// have multiple allocations for the same bucket. | ||
let _allocator_guard = LOCK.lock().unwrap_or_else(|e| e.into_inner()); | ||
|
||
let ptr = bucket.load(Ordering::Acquire); | ||
|
||
// OK, now under the allocator lock, if we're still null then it's definitely us that will | ||
// initialize this bucket. | ||
if ptr.is_null() { | ||
let bucket_layout = | ||
std::alloc::Layout::array::<Slot<V>>(self.entries as usize).unwrap(); | ||
// This is more of a sanity check -- this code is very cold, so it's safe to pay a | ||
// little extra cost here. | ||
assert!(bucket_layout.size() > 0); | ||
// SAFETY: Just checked that size is non-zero. | ||
let allocated = unsafe { std::alloc::alloc_zeroed(bucket_layout).cast::<Slot<V>>() }; | ||
if allocated.is_null() { | ||
std::alloc::handle_alloc_error(bucket_layout); | ||
} | ||
bucket.store(allocated, Ordering::Release); | ||
allocated | ||
} else { | ||
// Otherwise some other thread initialized this bucket after we took the lock. In that | ||
// case, just return early. | ||
ptr | ||
} | ||
} | ||
|
||
/// Returns true if this successfully put into the map. | ||
#[inline] | ||
fn put<V>(&self, buckets: &[AtomicPtr<Slot<V>>; 21], value: V, extra: u32) -> bool { | ||
// SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 21, i.e., | ||
// in-bounds of buckets. | ||
let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) }; | ||
let ptr = self.bucket_ptr(bucket); | ||
|
||
assert!(self.index_in_bucket < self.entries); | ||
// SAFETY: `bucket` was allocated (so <= isize in total bytes) to hold `entries`, so this | ||
// must be inbounds. | ||
let slot = unsafe { ptr.add(self.index_in_bucket) }; | ||
|
||
// SAFETY: initialized bucket has zeroed all memory within the bucket, so we are valid for | ||
// AtomicU32 access. | ||
let index_and_lock = unsafe { &(*slot).index_and_lock }; | ||
match index_and_lock.compare_exchange(0, 1, Ordering::AcqRel, Ordering::Acquire) { | ||
Ok(_) => { | ||
// We have acquired the initialization lock. It is our job to write `value` and | ||
// then set the lock to the real index. | ||
|
||
unsafe { | ||
(&raw mut (*slot).value).write(value); | ||
} | ||
|
||
index_and_lock.store(extra.checked_add(2).unwrap(), Ordering::Release); | ||
|
||
true | ||
} | ||
|
||
// Treat "initializing" as the caller's fault. Callers are responsible for ensuring that | ||
// there are no races on initialization. In the compiler's current usage for query | ||
// caches, that's the "active query map" which ensures each query actually runs once | ||
// (even if concurrently started). | ||
Err(1) => panic!("caller raced calls to put()"), | ||
|
||
// This slot was already populated. Also ignore, currently this is the same as | ||
// "initializing". | ||
Err(_) => false, | ||
} | ||
} | ||
} | ||
|
||
pub struct VecCache<K: Idx, V, I> { | ||
// Entries per bucket: | ||
// Bucket 0: 4096 2^12 | ||
// Bucket 1: 4096 2^12 | ||
// Bucket 2: 8192 | ||
// Bucket 3: 16384 | ||
// ... | ||
// Bucket 19: 1073741824 | ||
// Bucket 20: 2147483648 | ||
// The total number of entries if all buckets are initialized is u32::MAX-1. | ||
buckets: [AtomicPtr<Slot<V>>; 21], | ||
|
||
// In the compiler's current usage these are only *read* during incremental and self-profiling. | ||
// They are an optimization over iterating the full buckets array. | ||
present: [AtomicPtr<Slot<()>>; 21], | ||
len: AtomicUsize, | ||
|
||
key: PhantomData<(K, I)>, | ||
} | ||
|
||
impl<K: Idx, V, I> Default for VecCache<K, V, I> { | ||
fn default() -> Self { | ||
VecCache { | ||
buckets: Default::default(), | ||
key: PhantomData, | ||
len: Default::default(), | ||
present: Default::default(), | ||
} | ||
} | ||
} | ||
|
||
// SAFETY: No access to `V` is made. | ||
unsafe impl<K: Idx, #[may_dangle] V, I> Drop for VecCache<K, V, I> { | ||
fn drop(&mut self) { | ||
// We have unique ownership, so no locks etc. are needed. Since `K` and `V` are both `Copy`, | ||
// we are also guaranteed to just need to deallocate any large arrays (not iterate over | ||
// contents). | ||
// | ||
// Confirm no need to deallocate invidual entries. Note that `V: Copy` is asserted on | ||
// insert/lookup but not necessarily construction, primarily to avoid annoyingly propagating | ||
// the bounds into struct definitions everywhere. | ||
assert!(!std::mem::needs_drop::<K>()); | ||
assert!(!std::mem::needs_drop::<V>()); | ||
|
||
for (idx, bucket) in self.buckets.iter().enumerate() { | ||
let bucket = bucket.load(Ordering::Acquire); | ||
if !bucket.is_null() { | ||
let layout = std::alloc::Layout::array::<Slot<V>>(ENTRIES_BY_BUCKET[idx]).unwrap(); | ||
unsafe { | ||
std::alloc::dealloc(bucket.cast(), layout); | ||
} | ||
} | ||
} | ||
|
||
for (idx, bucket) in self.present.iter().enumerate() { | ||
let bucket = bucket.load(Ordering::Acquire); | ||
if !bucket.is_null() { | ||
let layout = std::alloc::Layout::array::<Slot<()>>(ENTRIES_BY_BUCKET[idx]).unwrap(); | ||
unsafe { | ||
std::alloc::dealloc(bucket.cast(), layout); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
impl<K, V, I> VecCache<K, V, I> | ||
where | ||
K: Eq + Idx + Copy + Debug, | ||
V: Copy, | ||
I: Idx + Copy, | ||
{ | ||
#[inline(always)] | ||
pub fn lookup(&self, key: &K) -> Option<(V, I)> { | ||
let key = u32::try_from(key.index()).unwrap(); | ||
let slot_idx = SlotIndex::from_index(key); | ||
match unsafe { slot_idx.get(&self.buckets) } { | ||
Some((value, idx)) => Some((value, I::new(idx as usize))), | ||
None => None, | ||
} | ||
} | ||
|
||
#[inline] | ||
pub fn complete(&self, key: K, value: V, index: I) { | ||
let key = u32::try_from(key.index()).unwrap(); | ||
let slot_idx = SlotIndex::from_index(key); | ||
if slot_idx.put(&self.buckets, value, index.index() as u32) { | ||
let present_idx = self.len.fetch_add(1, Ordering::Relaxed); | ||
let slot = SlotIndex::from_index(present_idx as u32); | ||
// We should always be uniquely putting due to `len` fetch_add returning unique values. | ||
assert!(slot.put(&self.present, (), key)); | ||
} | ||
} | ||
|
||
pub fn iter(&self, f: &mut dyn FnMut(&K, &V, I)) { | ||
for idx in 0..self.len.load(Ordering::Acquire) { | ||
let key = SlotIndex::from_index(idx as u32); | ||
match unsafe { key.get(&self.present) } { | ||
// This shouldn't happen in our current usage (iter is really only | ||
// used long after queries are done running), but if we hit this in practice it's | ||
// probably fine to just break early. | ||
None => unreachable!(), | ||
Some(((), key)) => { | ||
let key = K::new(key as usize); | ||
// unwrap() is OK: present entries are always written only after we put the real | ||
// entry. | ||
let value = self.lookup(&key).unwrap(); | ||
f(&key, &value.0, value.1); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests; |
Oops, something went wrong.