From ecca8ce710c1076af5178566462e819dd4261c9d Mon Sep 17 00:00:00 2001 From: joboet Date: Wed, 21 Sep 2022 16:44:27 +0200 Subject: [PATCH] std: simplify TLS key creation on Windows --- library/std/src/lib.rs | 1 + library/std/src/sys/sgx/thread_local_key.rs | 5 - library/std/src/sys/solid/thread_local_key.rs | 5 - library/std/src/sys/unix/thread_local_key.rs | 27 ++- .../src/sys/unsupported/thread_local_key.rs | 5 - library/std/src/sys/windows/c.rs | 1 + .../std/src/sys/windows/thread_local_key.rs | 176 +++++++----------- .../std/src/sys_common/thread_local_key.rs | 46 +---- 8 files changed, 93 insertions(+), 173 deletions(-) diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index f13500de01431..592ddd8494ba1 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -251,6 +251,7 @@ #![feature(doc_notable_trait)] #![feature(dropck_eyepatch)] #![feature(exhaustive_patterns)] +#![feature(inline_const)] #![feature(intra_doc_pointers)] #![cfg_attr(bootstrap, feature(label_break_value))] #![feature(lang_items)] diff --git a/library/std/src/sys/sgx/thread_local_key.rs b/library/std/src/sys/sgx/thread_local_key.rs index b21784475f0d2..c7a57d3a3d47e 100644 --- a/library/std/src/sys/sgx/thread_local_key.rs +++ b/library/std/src/sys/sgx/thread_local_key.rs @@ -21,8 +21,3 @@ pub unsafe fn get(key: Key) -> *mut u8 { pub unsafe fn destroy(key: Key) { Tls::destroy(AbiKey::from_usize(key)) } - -#[inline] -pub fn requires_synchronized_create() -> bool { - false -} diff --git a/library/std/src/sys/solid/thread_local_key.rs b/library/std/src/sys/solid/thread_local_key.rs index b17521f701daf..b37bf99969887 100644 --- a/library/std/src/sys/solid/thread_local_key.rs +++ b/library/std/src/sys/solid/thread_local_key.rs @@ -19,8 +19,3 @@ pub unsafe fn get(_key: Key) -> *mut u8 { pub unsafe fn destroy(_key: Key) { panic!("should not be used on the solid target"); } - -#[inline] -pub fn requires_synchronized_create() -> bool { - panic!("should not be used on the solid target"); -} diff --git a/library/std/src/sys/unix/thread_local_key.rs b/library/std/src/sys/unix/thread_local_key.rs index 2c5b94b1e61e5..bfe78bc529651 100644 --- a/library/std/src/sys/unix/thread_local_key.rs +++ b/library/std/src/sys/unix/thread_local_key.rs @@ -6,8 +6,28 @@ pub type Key = libc::pthread_key_t; #[inline] pub unsafe fn create(dtor: Option) -> Key { + let dtor = mem::transmute(dtor); let mut key = 0; - assert_eq!(libc::pthread_key_create(&mut key, mem::transmute(dtor)), 0); + let r = libc::pthread_key_create(&mut key, dtor); + assert_eq!(r, 0); + + // POSIX allows the key created here to be 0, but `StaticKey` relies + // on using 0 as a sentinel value to check who won the race to set the + // shared TLS key. As far as I know, there is no guaranteed value that + // cannot be returned as a posix_key_create key, so there is no value + // we can initialize the inner key with to prove that it has not yet + // been set. Therefore, we use this small trick to ensure the returned + // key is not zero. + if key == 0 { + let mut new = 0; + // Only check the creation result after deleting the old key to avoid + // leaking it. + let r_c = libc::pthread_key_create(&mut new, dtor); + let r_d = libc::pthread_key_delete(key); + assert_eq!(r_c, 0); + debug_assert_eq!(r_d, 0); + key = new; + } key } @@ -27,8 +47,3 @@ pub unsafe fn destroy(key: Key) { let r = libc::pthread_key_delete(key); debug_assert_eq!(r, 0); } - -#[inline] -pub fn requires_synchronized_create() -> bool { - false -} diff --git a/library/std/src/sys/unsupported/thread_local_key.rs b/library/std/src/sys/unsupported/thread_local_key.rs index c31b61cbf56d3..b6e5e4cd2e197 100644 --- a/library/std/src/sys/unsupported/thread_local_key.rs +++ b/library/std/src/sys/unsupported/thread_local_key.rs @@ -19,8 +19,3 @@ pub unsafe fn get(_key: Key) -> *mut u8 { pub unsafe fn destroy(_key: Key) { panic!("should not be used on this target"); } - -#[inline] -pub fn requires_synchronized_create() -> bool { - panic!("should not be used on this target"); -} diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs index 89d0ab59be89f..920d9d187bbc3 100644 --- a/library/std/src/sys/windows/c.rs +++ b/library/std/src/sys/windows/c.rs @@ -959,6 +959,7 @@ extern "system" { pub fn TlsAlloc() -> DWORD; pub fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID; pub fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL; + pub fn TlsFree(dwTlsIndex: DWORD) -> BOOL; pub fn GetLastError() -> DWORD; pub fn QueryPerformanceFrequency(lpFrequency: *mut LARGE_INTEGER) -> BOOL; pub fn QueryPerformanceCounter(lpPerformanceCount: *mut LARGE_INTEGER) -> BOOL; diff --git a/library/std/src/sys/windows/thread_local_key.rs b/library/std/src/sys/windows/thread_local_key.rs index ec670238e6f0e..4a3820bdfe60d 100644 --- a/library/std/src/sys/windows/thread_local_key.rs +++ b/library/std/src/sys/windows/thread_local_key.rs @@ -1,7 +1,9 @@ -use crate::mem::ManuallyDrop; +use crate::mem; use crate::ptr; -use crate::sync::atomic::AtomicPtr; -use crate::sync::atomic::Ordering::SeqCst; +use crate::sync::atomic::{ + compiler_fence, AtomicPtr, AtomicUsize, + Ordering::{Relaxed, Release}, +}; use crate::sys::c; pub type Key = c::DWORD; @@ -19,106 +21,93 @@ pub type Dtor = unsafe extern "C" fn(*mut u8); // somewhere to run arbitrary code on thread termination. With this in place // we'll be able to run anything we like, including all TLS destructors! // -// To accomplish this feat, we perform a number of threads, all contained -// within this module: -// -// * All TLS destructors are tracked by *us*, not the windows runtime. This -// means that we have a global list of destructors for each TLS key that -// we know about. -// * When a thread exits, we run over the entire list and run dtors for all -// non-null keys. This attempts to match Unix semantics in this regard. -// -// This ends up having the overhead of using a global list, having some -// locks here and there, and in general just adding some more code bloat. We -// attempt to optimize runtime by forgetting keys that don't have -// destructors, but this only gets us so far. +// Since the maximum number of keys is 1088 [3] and key values are always lower +// than 1088 [4], we can just use a static array to store the destructor functions +// and use the TLS key as index. This avoids all synchronization problems +// encountered with linked lists or other kinds of storage. // // For more details and nitty-gritty, see the code sections below! // // [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way -// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base -// /threading/thread_local_storage_win.cc#L42 +// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42 +// [3]: https://learn.microsoft.com/en-us/windows/win32/procthread/thread-local-storage +// [4]: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-tlssetvalue -// ------------------------------------------------------------------------- -// Native bindings -// -// This section is just raw bindings to the native functions that Windows -// provides, There's a few extra calls to deal with destructors. +static DTORS: [AtomicPtr<()>; 1088] = [const { AtomicPtr::new(ptr::null_mut()) }; 1088]; +// The highest key that has a destructor associated with it. Used as an +// optimization so we don't need to iterate over the whole array when there +// are only a few keys. +static HIGHEST: AtomicUsize = AtomicUsize::new(0); #[inline] pub unsafe fn create(dtor: Option) -> Key { let key = c::TlsAlloc(); assert!(key != c::TLS_OUT_OF_INDEXES); - if let Some(f) = dtor { - register_dtor(key, f); + + if let Some(dtor) = dtor { + DTORS[key as usize].store(mem::transmute::(dtor), Relaxed); + HIGHEST.fetch_max(key as usize, Relaxed); + // If the destructors are run in a signal handler running after this + // code, we need to guarantee that the changes have been performed + // before the handler is triggered. + compiler_fence(Release); } - key + + // Ensure that the key is always non-null. Since key values are below + // 1088, this cannot overflow. + key + 1 } #[inline] pub unsafe fn set(key: Key, value: *mut u8) { - let r = c::TlsSetValue(key, value as c::LPVOID); + let r = c::TlsSetValue(key - 1, value as c::LPVOID); debug_assert!(r != 0); } #[inline] pub unsafe fn get(key: Key) -> *mut u8 { - c::TlsGetValue(key) as *mut u8 -} - -#[inline] -pub unsafe fn destroy(_key: Key) { - rtabort!("can't destroy tls keys on windows") + c::TlsGetValue(key - 1) as *mut u8 } #[inline] -pub fn requires_synchronized_create() -> bool { - true -} - -// ------------------------------------------------------------------------- -// Dtor registration -// -// Windows has no native support for running destructors so we manage our own -// list of destructors to keep track of how to destroy keys. We then install a -// callback later to get invoked whenever a thread exits, running all -// appropriate destructors. -// -// Currently unregistration from this list is not supported. A destructor can be -// registered but cannot be unregistered. There's various simplifying reasons -// for doing this, the big ones being: -// -// 1. Currently we don't even support deallocating TLS keys, so normal operation -// doesn't need to deallocate a destructor. -// 2. There is no point in time where we know we can unregister a destructor -// because it could always be getting run by some remote thread. -// -// Typically processes have a statically known set of TLS keys which is pretty -// small, and we'd want to keep this memory alive for the whole process anyway -// really. -// -// Perhaps one day we can fold the `Box` here into a static allocation, -// expanding the `StaticKey` structure to contain not only a slot for the TLS -// key but also a slot for the destructor queue on windows. An optimization for -// another day! - -static DTORS: AtomicPtr = AtomicPtr::new(ptr::null_mut()); - -struct Node { - dtor: Dtor, - key: Key, - next: *mut Node, +pub unsafe fn destroy(key: Key) { + DTORS[(key - 1) as usize].store(ptr::null_mut(), Relaxed); + let r = c::TlsFree(key - 1); + // Use release ordering for the same reason as above. + compiler_fence(Release); + debug_assert!(r != 0); } -unsafe fn register_dtor(key: Key, dtor: Dtor) { - let mut node = ManuallyDrop::new(Box::new(Node { key, dtor, next: ptr::null_mut() })); +#[allow(dead_code)] // actually called below +unsafe fn run_dtors() { + let mut iterations = 5; + while iterations != 0 { + let mut any_run = false; + // All keys have either been created by the current thread or must + // have been propagated through other means of synchronization, so + // we can just use relaxed ordering here and still observe all + // changes relevant to us. + let highest = HIGHEST.load(Relaxed); + for (index, dtor) in DTORS[..highest].iter().enumerate() { + let dtor = mem::transmute::<*mut (), Option>(dtor.load(Relaxed)); + if let Some(dtor) = dtor { + let ptr = c::TlsGetValue(index as Key) as *mut u8; + if !ptr.is_null() { + let r = c::TlsSetValue(index as Key, ptr::null_mut()); + debug_assert!(r != 0); + + (dtor)(ptr); + any_run = true; + } + } + } - let mut head = DTORS.load(SeqCst); - loop { - node.next = head; - match DTORS.compare_exchange(head, &mut **node, SeqCst, SeqCst) { - Ok(_) => return, // nothing to drop, we successfully added the node to the list - Err(cur) => head = cur, + iterations -= 1; + // If no destructors where run, no new keys have been initialized, + // so we are done. FIXME: Maybe use TLS to store the number of active + // keys per thread. + if !any_run { + return; } } } @@ -154,16 +143,6 @@ unsafe fn register_dtor(key: Key, dtor: Dtor) { // thread or a process "detaches" (exits). The process part happens for the // last thread and the thread part happens for any normal thread. // -// # Ok, what's up with running all these destructors? -// -// This will likely need to be improved over time, but this function -// attempts a "poor man's" destructor callback system. Once we've got a list -// of what to run, we iterate over all keys, check their values, and then run -// destructors if the values turn out to be non null (setting them to null just -// beforehand). We do this a few times in a loop to basically match Unix -// semantics. If we don't reach a fixed point after a short while then we just -// inevitably leak something most likely. -// // # The article mentions weird stuff about "/INCLUDE"? // // It sure does! Specifically we're talking about this quote: @@ -213,26 +192,3 @@ unsafe extern "system" fn on_tls_callback(h: c::LPVOID, dwReason: c::DWORD, pv: #[cfg(not(target_env = "msvc"))] unsafe fn reference_tls_used() {} } - -#[allow(dead_code)] // actually called above -unsafe fn run_dtors() { - let mut any_run = true; - for _ in 0..5 { - if !any_run { - break; - } - any_run = false; - let mut cur = DTORS.load(SeqCst); - while !cur.is_null() { - let ptr = c::TlsGetValue((*cur).key); - - if !ptr.is_null() { - c::TlsSetValue((*cur).key, ptr::null_mut()); - ((*cur).dtor)(ptr as *mut _); - any_run = true; - } - - cur = (*cur).next; - } - } -} diff --git a/library/std/src/sys_common/thread_local_key.rs b/library/std/src/sys_common/thread_local_key.rs index 032bf604d7388..ad8d36d086ccf 100644 --- a/library/std/src/sys_common/thread_local_key.rs +++ b/library/std/src/sys_common/thread_local_key.rs @@ -53,7 +53,6 @@ mod tests; use crate::sync::atomic::{self, AtomicUsize, Ordering}; use crate::sys::thread_local_key as imp; -use crate::sys_common::mutex::StaticMutex; /// A type for TLS keys that are statically allocated. /// @@ -151,44 +150,9 @@ impl StaticKey { } unsafe fn lazy_init(&self) -> usize { - // Currently the Windows implementation of TLS is pretty hairy, and - // it greatly simplifies creation if we just synchronize everything. - // - // Additionally a 0-index of a tls key hasn't been seen on windows, so - // we just simplify the whole branch. - if imp::requires_synchronized_create() { - // We never call `INIT_LOCK.init()`, so it is UB to attempt to - // acquire this mutex reentrantly! - static INIT_LOCK: StaticMutex = StaticMutex::new(); - let _guard = INIT_LOCK.lock(); - let mut key = self.key.load(Ordering::SeqCst); - if key == 0 { - key = imp::create(self.dtor) as usize; - self.key.store(key, Ordering::SeqCst); - } - rtassert!(key != 0); - return key; - } - - // POSIX allows the key created here to be 0, but the compare_exchange - // below relies on using 0 as a sentinel value to check who won the - // race to set the shared TLS key. As far as I know, there is no - // guaranteed value that cannot be returned as a posix_key_create key, - // so there is no value we can initialize the inner key with to - // prove that it has not yet been set. As such, we'll continue using a - // value of 0, but with some gyrations to make sure we have a non-0 - // value returned from the creation routine. - // FIXME: this is clearly a hack, and should be cleaned up. - let key1 = imp::create(self.dtor); - let key = if key1 != 0 { - key1 - } else { - let key2 = imp::create(self.dtor); - imp::destroy(key1); - key2 - }; - rtassert!(key != 0); - match self.key.compare_exchange(0, key as usize, Ordering::SeqCst, Ordering::SeqCst) { + let key = imp::create(self.dtor); + debug_assert!(key != 0); + match self.key.compare_exchange(0, key as usize, Ordering::AcqRel, Ordering::Acquire) { // The CAS succeeded, so we've created the actual key Ok(_) => key as usize, // If someone beat us to the punch, use their key instead @@ -232,8 +196,6 @@ impl Key { impl Drop for Key { fn drop(&mut self) { - // Right now Windows doesn't support TLS key destruction, but this also - // isn't used anywhere other than tests, so just leak the TLS key. - // unsafe { imp::destroy(self.key) } + unsafe { imp::destroy(self.key) } } }