Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify TLS key creation on Windows #102103

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions library/std/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@
#![feature(doc_notable_trait)]
#![feature(dropck_eyepatch)]
#![feature(exhaustive_patterns)]
#![feature(inline_const)]
#![feature(intra_doc_pointers)]
#![cfg_attr(bootstrap, feature(label_break_value))]
#![feature(lang_items)]
Expand Down
5 changes: 0 additions & 5 deletions library/std/src/sys/sgx/thread_local_key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,3 @@ pub unsafe fn get(key: Key) -> *mut u8 {
pub unsafe fn destroy(key: Key) {
Tls::destroy(AbiKey::from_usize(key))
}

#[inline]
pub fn requires_synchronized_create() -> bool {
false
}
5 changes: 0 additions & 5 deletions library/std/src/sys/solid/thread_local_key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,3 @@ pub unsafe fn get(_key: Key) -> *mut u8 {
pub unsafe fn destroy(_key: Key) {
panic!("should not be used on the solid target");
}

#[inline]
pub fn requires_synchronized_create() -> bool {
panic!("should not be used on the solid target");
}
27 changes: 21 additions & 6 deletions library/std/src/sys/unix/thread_local_key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,28 @@ pub type Key = libc::pthread_key_t;

#[inline]
pub unsafe fn create(dtor: Option<unsafe extern "C" fn(*mut u8)>) -> Key {
let dtor = mem::transmute(dtor);
let mut key = 0;
assert_eq!(libc::pthread_key_create(&mut key, mem::transmute(dtor)), 0);
let r = libc::pthread_key_create(&mut key, dtor);
assert_eq!(r, 0);

// POSIX allows the key created here to be 0, but `StaticKey` relies
// on using 0 as a sentinel value to check who won the race to set the
// shared TLS key. As far as I know, there is no guaranteed value that
// cannot be returned as a posix_key_create key, so there is no value
// we can initialize the inner key with to prove that it has not yet
// been set. Therefore, we use this small trick to ensure the returned
// key is not zero.
if key == 0 {
let mut new = 0;
// Only check the creation result after deleting the old key to avoid
// leaking it.
let r_c = libc::pthread_key_create(&mut new, dtor);
let r_d = libc::pthread_key_delete(key);
assert_eq!(r_c, 0);
debug_assert_eq!(r_d, 0);
key = new;
}
key
}

Expand All @@ -27,8 +47,3 @@ pub unsafe fn destroy(key: Key) {
let r = libc::pthread_key_delete(key);
debug_assert_eq!(r, 0);
}

#[inline]
pub fn requires_synchronized_create() -> bool {
false
}
5 changes: 0 additions & 5 deletions library/std/src/sys/unsupported/thread_local_key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,3 @@ pub unsafe fn get(_key: Key) -> *mut u8 {
pub unsafe fn destroy(_key: Key) {
panic!("should not be used on this target");
}

#[inline]
pub fn requires_synchronized_create() -> bool {
panic!("should not be used on this target");
}
1 change: 1 addition & 0 deletions library/std/src/sys/windows/c.rs
Original file line number Diff line number Diff line change
Expand Up @@ -959,6 +959,7 @@ extern "system" {
pub fn TlsAlloc() -> DWORD;
pub fn TlsGetValue(dwTlsIndex: DWORD) -> LPVOID;
pub fn TlsSetValue(dwTlsIndex: DWORD, lpTlsvalue: LPVOID) -> BOOL;
pub fn TlsFree(dwTlsIndex: DWORD) -> BOOL;
pub fn GetLastError() -> DWORD;
pub fn QueryPerformanceFrequency(lpFrequency: *mut LARGE_INTEGER) -> BOOL;
pub fn QueryPerformanceCounter(lpPerformanceCount: *mut LARGE_INTEGER) -> BOOL;
Expand Down
176 changes: 66 additions & 110 deletions library/std/src/sys/windows/thread_local_key.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use crate::mem::ManuallyDrop;
use crate::mem;
use crate::ptr;
use crate::sync::atomic::AtomicPtr;
use crate::sync::atomic::Ordering::SeqCst;
use crate::sync::atomic::{
compiler_fence, AtomicPtr, AtomicUsize,
Ordering::{Relaxed, Release},
};
use crate::sys::c;

pub type Key = c::DWORD;
Expand All @@ -19,106 +21,93 @@ pub type Dtor = unsafe extern "C" fn(*mut u8);
// somewhere to run arbitrary code on thread termination. With this in place
// we'll be able to run anything we like, including all TLS destructors!
//
// To accomplish this feat, we perform a number of threads, all contained
// within this module:
//
// * All TLS destructors are tracked by *us*, not the windows runtime. This
// means that we have a global list of destructors for each TLS key that
// we know about.
// * When a thread exits, we run over the entire list and run dtors for all
// non-null keys. This attempts to match Unix semantics in this regard.
//
// This ends up having the overhead of using a global list, having some
// locks here and there, and in general just adding some more code bloat. We
// attempt to optimize runtime by forgetting keys that don't have
// destructors, but this only gets us so far.
// Since the maximum number of keys is 1088 [3] and key values are always lower
// than 1088 [4], we can just use a static array to store the destructor functions
// and use the TLS key as index. This avoids all synchronization problems
// encountered with linked lists or other kinds of storage.
//
// For more details and nitty-gritty, see the code sections below!
//
// [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base
// /threading/thread_local_storage_win.cc#L42
// [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42
// [3]: https://learn.microsoft.com/en-us/windows/win32/procthread/thread-local-storage
// [4]: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-tlssetvalue

// -------------------------------------------------------------------------
// Native bindings
//
// This section is just raw bindings to the native functions that Windows
// provides, There's a few extra calls to deal with destructors.
static DTORS: [AtomicPtr<()>; 1088] = [const { AtomicPtr::new(ptr::null_mut()) }; 1088];
// The highest key that has a destructor associated with it. Used as an
// optimization so we don't need to iterate over the whole array when there
// are only a few keys.
static HIGHEST: AtomicUsize = AtomicUsize::new(0);

#[inline]
pub unsafe fn create(dtor: Option<Dtor>) -> Key {
let key = c::TlsAlloc();
assert!(key != c::TLS_OUT_OF_INDEXES);
if let Some(f) = dtor {
register_dtor(key, f);

if let Some(dtor) = dtor {
DTORS[key as usize].store(mem::transmute::<Dtor, *mut ()>(dtor), Relaxed);
HIGHEST.fetch_max(key as usize, Relaxed);
// If the destructors are run in a signal handler running after this
// code, we need to guarantee that the changes have been performed
// before the handler is triggered.
compiler_fence(Release);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This micro-optimization doesn't really feel likely to be worth it, and it's plausibly not even totally sound.

We should just use the correct orderings for these atomics -- it really shouldn't make a performance difference and is easier to reason about.

}
key

// Ensure that the key is always non-null. Since key values are below
// 1088, this cannot overflow.
key + 1
}

#[inline]
pub unsafe fn set(key: Key, value: *mut u8) {
let r = c::TlsSetValue(key, value as c::LPVOID);
let r = c::TlsSetValue(key - 1, value as c::LPVOID);
debug_assert!(r != 0);
}

#[inline]
pub unsafe fn get(key: Key) -> *mut u8 {
c::TlsGetValue(key) as *mut u8
}

#[inline]
pub unsafe fn destroy(_key: Key) {
rtabort!("can't destroy tls keys on windows")
c::TlsGetValue(key - 1) as *mut u8
}

#[inline]
pub fn requires_synchronized_create() -> bool {
true
}

// -------------------------------------------------------------------------
// Dtor registration
//
// Windows has no native support for running destructors so we manage our own
// list of destructors to keep track of how to destroy keys. We then install a
// callback later to get invoked whenever a thread exits, running all
// appropriate destructors.
//
// Currently unregistration from this list is not supported. A destructor can be
// registered but cannot be unregistered. There's various simplifying reasons
// for doing this, the big ones being:
//
// 1. Currently we don't even support deallocating TLS keys, so normal operation
// doesn't need to deallocate a destructor.
// 2. There is no point in time where we know we can unregister a destructor
// because it could always be getting run by some remote thread.
//
// Typically processes have a statically known set of TLS keys which is pretty
// small, and we'd want to keep this memory alive for the whole process anyway
// really.
//
// Perhaps one day we can fold the `Box` here into a static allocation,
// expanding the `StaticKey` structure to contain not only a slot for the TLS
// key but also a slot for the destructor queue on windows. An optimization for
// another day!

static DTORS: AtomicPtr<Node> = AtomicPtr::new(ptr::null_mut());

struct Node {
dtor: Dtor,
key: Key,
next: *mut Node,
pub unsafe fn destroy(key: Key) {
DTORS[(key - 1) as usize].store(ptr::null_mut(), Relaxed);
let r = c::TlsFree(key - 1);
// Use release ordering for the same reason as above.
compiler_fence(Release);
debug_assert!(r != 0);
}

unsafe fn register_dtor(key: Key, dtor: Dtor) {
let mut node = ManuallyDrop::new(Box::new(Node { key, dtor, next: ptr::null_mut() }));
#[allow(dead_code)] // actually called below
unsafe fn run_dtors() {
let mut iterations = 5;
while iterations != 0 {
let mut any_run = false;
// All keys have either been created by the current thread or must
// have been propagated through other means of synchronization, so
// we can just use relaxed ordering here and still observe all
// changes relevant to us.
let highest = HIGHEST.load(Relaxed);
for (index, dtor) in DTORS[..highest].iter().enumerate() {
let dtor = mem::transmute::<*mut (), Option<Dtor>>(dtor.load(Relaxed));
if let Some(dtor) = dtor {
let ptr = c::TlsGetValue(index as Key) as *mut u8;
if !ptr.is_null() {
let r = c::TlsSetValue(index as Key, ptr::null_mut());
debug_assert!(r != 0);

(dtor)(ptr);
any_run = true;
}
}
}

let mut head = DTORS.load(SeqCst);
loop {
node.next = head;
match DTORS.compare_exchange(head, &mut **node, SeqCst, SeqCst) {
Ok(_) => return, // nothing to drop, we successfully added the node to the list
Err(cur) => head = cur,
iterations -= 1;
// If no destructors where run, no new keys have been initialized,
// so we are done. FIXME: Maybe use TLS to store the number of active
// keys per thread.
if !any_run {
return;
}
}
}
Expand Down Expand Up @@ -154,16 +143,6 @@ unsafe fn register_dtor(key: Key, dtor: Dtor) {
// thread or a process "detaches" (exits). The process part happens for the
// last thread and the thread part happens for any normal thread.
//
// # Ok, what's up with running all these destructors?
//
// This will likely need to be improved over time, but this function
// attempts a "poor man's" destructor callback system. Once we've got a list
// of what to run, we iterate over all keys, check their values, and then run
// destructors if the values turn out to be non null (setting them to null just
// beforehand). We do this a few times in a loop to basically match Unix
// semantics. If we don't reach a fixed point after a short while then we just
// inevitably leak something most likely.
//
// # The article mentions weird stuff about "/INCLUDE"?
//
// It sure does! Specifically we're talking about this quote:
Expand Down Expand Up @@ -213,26 +192,3 @@ unsafe extern "system" fn on_tls_callback(h: c::LPVOID, dwReason: c::DWORD, pv:
#[cfg(not(target_env = "msvc"))]
unsafe fn reference_tls_used() {}
}

#[allow(dead_code)] // actually called above
unsafe fn run_dtors() {
let mut any_run = true;
for _ in 0..5 {
if !any_run {
break;
}
any_run = false;
let mut cur = DTORS.load(SeqCst);
while !cur.is_null() {
let ptr = c::TlsGetValue((*cur).key);

if !ptr.is_null() {
c::TlsSetValue((*cur).key, ptr::null_mut());
((*cur).dtor)(ptr as *mut _);
any_run = true;
}

cur = (*cur).next;
}
}
}
46 changes: 4 additions & 42 deletions library/std/src/sys_common/thread_local_key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ mod tests;

use crate::sync::atomic::{self, AtomicUsize, Ordering};
use crate::sys::thread_local_key as imp;
use crate::sys_common::mutex::StaticMutex;

/// A type for TLS keys that are statically allocated.
///
Expand Down Expand Up @@ -151,44 +150,9 @@ impl StaticKey {
}

unsafe fn lazy_init(&self) -> usize {
// Currently the Windows implementation of TLS is pretty hairy, and
// it greatly simplifies creation if we just synchronize everything.
//
// Additionally a 0-index of a tls key hasn't been seen on windows, so
// we just simplify the whole branch.
if imp::requires_synchronized_create() {
// We never call `INIT_LOCK.init()`, so it is UB to attempt to
// acquire this mutex reentrantly!
static INIT_LOCK: StaticMutex = StaticMutex::new();
let _guard = INIT_LOCK.lock();
let mut key = self.key.load(Ordering::SeqCst);
if key == 0 {
key = imp::create(self.dtor) as usize;
self.key.store(key, Ordering::SeqCst);
}
rtassert!(key != 0);
return key;
}

// POSIX allows the key created here to be 0, but the compare_exchange
// below relies on using 0 as a sentinel value to check who won the
// race to set the shared TLS key. As far as I know, there is no
// guaranteed value that cannot be returned as a posix_key_create key,
// so there is no value we can initialize the inner key with to
// prove that it has not yet been set. As such, we'll continue using a
// value of 0, but with some gyrations to make sure we have a non-0
// value returned from the creation routine.
// FIXME: this is clearly a hack, and should be cleaned up.
let key1 = imp::create(self.dtor);
let key = if key1 != 0 {
key1
} else {
let key2 = imp::create(self.dtor);
imp::destroy(key1);
key2
};
rtassert!(key != 0);
match self.key.compare_exchange(0, key as usize, Ordering::SeqCst, Ordering::SeqCst) {
let key = imp::create(self.dtor);
debug_assert!(key != 0);
match self.key.compare_exchange(0, key as usize, Ordering::AcqRel, Ordering::Acquire) {
// The CAS succeeded, so we've created the actual key
Ok(_) => key as usize,
// If someone beat us to the punch, use their key instead
Expand Down Expand Up @@ -232,8 +196,6 @@ impl Key {

impl Drop for Key {
fn drop(&mut self) {
// Right now Windows doesn't support TLS key destruction, but this also
// isn't used anywhere other than tests, so just leak the TLS key.
// unsafe { imp::destroy(self.key) }
unsafe { imp::destroy(self.key) }
}
}