From 37c31fb9ce0db53ce55beb273c4142aa97c98737 Mon Sep 17 00:00:00 2001 From: DongSheng He Date: Wed, 26 Apr 2023 15:45:10 +0800 Subject: [PATCH] fix compiler optimize thread local variable access (#2156) * fix compiler optimize thread local variable access * change __thread to BAIDU_THREAD_LOCAL * Add NOT_ALLOW_OPTIMIZE_THREAD_LOCAL_ACCESS according to compiler and arch * move thread local access optimization condition to thread_local.h --- src/bthread/task_group.cpp | 19 +++++++------------ src/butil/thread_local.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 6f5a4abdc7..469e1b0b8d 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -57,7 +57,7 @@ const bool ALLOW_UNUSED dummy_show_per_worker_usage_in_vars = ::GFLAGS_NS::RegisterFlagValidator(&FLAGS_show_per_worker_usage_in_vars, pass_bool); -__thread TaskGroup* tls_task_group = NULL; +BAIDU_VOLATILE_THREAD_LOCAL(TaskGroup*, tls_task_group, NULL); // Sync with TaskMeta::local_storage when a bthread is created or destroyed. // During running, the two fields may be inconsistent, use tls_bls as the // groundtruth. @@ -68,7 +68,7 @@ extern void return_keytable(bthread_keytable_pool_t*, KeyTable*); // [Hacky] This is a special TLS set by bthread-rpc privately... to save // overhead of creation keytable, may be removed later. -BAIDU_THREAD_LOCAL void* tls_unique_user_ptr = NULL; +BAIDU_VOLATILE_THREAD_LOCAL(void*, tls_unique_user_ptr, NULL); const TaskStatistics EMPTY_STAT = { 0, 0 }; @@ -248,9 +248,6 @@ int TaskGroup::init(size_t runqueue_capacity) { return 0; } -#if defined(__linux__) && defined(__aarch64__) && defined(__clang__) - __attribute__((optnone)) -#endif void TaskGroup::task_runner(intptr_t skip_remained) { // NOTE: tls_task_group is volatile since tasks are moved around // different groups. 
@@ -301,7 +298,7 @@ void TaskGroup::task_runner(intptr_t skip_remained) { } // Group is probably changed - g = tls_task_group; + g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); // TODO: Save thread_return (void)thread_return; @@ -570,9 +567,6 @@ void TaskGroup::sched(TaskGroup** pg) { sched_to(pg, next_tid); } -#if defined(__linux__) && defined(__aarch64__) && defined(__clang__) - __attribute__((optnone)) -#endif void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { TaskGroup* g = *pg; #ifndef NDEBUG @@ -614,7 +608,7 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { if (next_meta->stack != cur_meta->stack) { jump_stack(cur_meta->stack, next_meta->stack); // probably went to another group, need to assign g again. - g = tls_task_group; + g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); } #ifndef NDEBUG else { @@ -633,12 +627,13 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { RemainedFn fn = g->_last_context_remained; g->_last_context_remained = NULL; fn(g->_last_context_remained_arg); - g = tls_task_group; + g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); } // Restore errno errno = saved_errno; - tls_unique_user_ptr = saved_unique_user_ptr; + // tls_unique_user_ptr probably changed. 
+ BAIDU_SET_VOLATILE_THREAD_LOCAL(tls_unique_user_ptr, saved_unique_user_ptr); #ifndef NDEBUG --g->_sched_recursive_guard; diff --git a/src/butil/thread_local.h b/src/butil/thread_local.h index 1c462b0fff..4f77e95870 100644 --- a/src/butil/thread_local.h +++ b/src/butil/thread_local.h @@ -30,6 +30,35 @@ #define BAIDU_THREAD_LOCAL __thread #endif // _MSC_VER +#define BAIDU_VOLATILE_THREAD_LOCAL(type, var_name, default_value) \ + BAIDU_THREAD_LOCAL type var_name = default_value; \ + static __attribute__((noinline, unused)) type get_##var_name(void) { \ + asm volatile(""); \ + return var_name; \ + } \ + static __attribute__((noinline, unused)) type *get_ptr_##var_name(void) { \ + type *ptr = &var_name; \ + asm volatile("" : "+rm"(ptr)); \ + return ptr; \ + } \ + static __attribute__((noinline, unused)) void set_##var_name(type v) { \ + asm volatile(""); \ + var_name = v; \ + } + +#if defined(__clang__) && (defined(__aarch64__) || defined(__arm64__)) +// Clang compiler is incorrectly caching the address of thread_local variables +// across a suspend-point. The following macros are used to disable the volatile +// thread local access optimization. +#define BAIDU_GET_VOLATILE_THREAD_LOCAL(var_name) get_##var_name() +#define BAIDU_GET_PTR_VOLATILE_THREAD_LOCAL(var_name) get_ptr_##var_name() +#define BAIDU_SET_VOLATILE_THREAD_LOCAL(var_name, value) set_##var_name(value) +#else +#define BAIDU_GET_VOLATILE_THREAD_LOCAL(var_name) var_name +#define BAIDU_GET_PTR_VOLATILE_THREAD_LOCAL(var_name) &(var_name) +#define BAIDU_SET_VOLATILE_THREAD_LOCAL(var_name, value) var_name = value +#endif + namespace butil { // Get a thread-local object typed T. The object will be default-constructed