From 64b33747062c585df9f0eb810c28d367379a9998 Mon Sep 17 00:00:00 2001 From: Nick Ripley Date: Tue, 29 Oct 2024 12:37:40 -0400 Subject: [PATCH] fix(profiling): fix data race when accessing span for thread (#11167) The ThreadSpanLinks singleton holds the active span (if one exists) for a given thread ID. The `get_active_span_from_thread_id` member function returns a pointer to the active span for a thread. The `link_span` member function sets the active span for a thread. `get_active_span_from_thread_id` accesses the map of spans under a mutex, but returns the pointer after releasing the mutex, meaning `link_span` can modify the members of the Span while the caller of `get_active_span_from_thread_id` is reading them. Fix this by returning a copy of the `Span`. Use a `std::optional` to wrap the return value of `get_active_span_from_thread_id`, rather than returning a pointer. We want to tell whether or not there actually was a span associated with the thread, but returning a pointer would require us to heap allocate the copy of the Span. I added a simplistic regression test which fails reliably without this fix when built with the thread sanitizer enabled. Output like: ``` WARNING: ThreadSanitizer: data race (pid=2971510) Read of size 8 at 0x7b2000004080 by thread T2: #0 memcpy ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:823 (libtsan.so.0+0x42313) #1 memcpy ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:815 (libtsan.so.0+0x42313) #2 std::__cxx11::basic_string, std::allocator >::_M_assign(std::__cxx11::basic_string, std::allocator > const&) (libstdc++.so.6+0x1432b4) #3 std::__cxx11::basic_string, std::allocator > std::__invoke_impl, std::allocator >, std::__cxx11::basic_string, std::allocator > (*)()>(std::__invoke_other, std::__cxx11::basic_string, std::allocator > (*&&)()) (thread_span_links+0xe46e) #4 std::__invoke_result, std::allocator > (*)()>::type std::__invoke, std::allocator > (*)()>(std::__cxx11::basic_string, std::allocator > (*&&)()) (thread_span_links+0xe2fe) #5 std::__cxx11::basic_string, std::allocator > std::thread::_Invoker, std::allocator > (*)()> >::_M_invoke<0ul>(std::_Index_tuple<0ul>) (thread_span_links+0xe1cf) #6 std::thread::_Invoker, std::allocator > (*)()> >::operator()() (thread_span_links+0xe0f6) #7 std::thread::_State_impl, std::allocator > (*)()> > >::_M_run() (thread_span_links+0xdf40) #8 (libstdc++.so.6+0xd6df3) Previous write of size 8 at 0x7b2000004080 by thread T1 (mutexes: write M47): #0 memcpy ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:823 (libtsan.so.0+0x42313) #1 memcpy ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:815 (libtsan.so.0+0x42313) #2 std::__cxx11::basic_string, std::allocator >::_M_assign(std::__cxx11::basic_string, std::allocato r > const&) (libstdc++.so.6+0x1432b4) #3 get() (thread_span_links+0xb570) #4 void std::__invoke_impl(std::__invoke_other, void (*&&)()) (thread_span_links+0xe525) #5 std::__invoke_result::type std::__invoke(void (*&&)()) (thread_span_links+0xe3b5) #6 void std::thread::_Invoker >::_M_invoke<0ul>(std::_Index_tuple<0ul>) (thread_span_links+0xe242) #7 std::thread::_Invoker >::operator()() (thread_span_links+0xe158) [ ... etc ... ] ``` --- .../datadog/profiling/stack_v2/CMakeLists.txt | 6 +++ .../stack_v2/include/thread_span_links.hpp | 3 +- .../profiling/stack_v2/src/stack_renderer.cpp | 4 +- .../stack_v2/src/thread_span_links.cpp | 11 +++-- .../profiling/stack_v2/test/CMakeLists.txt | 25 ++++++++++ .../stack_v2/test/thread_span_links.cpp | 48 +++++++++++++++++++ ...x-data-race-segfault-b14f59c06c6bf9ed.yaml | 5 ++ 7 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 ddtrace/internal/datadog/profiling/stack_v2/test/CMakeLists.txt create mode 100644 ddtrace/internal/datadog/profiling/stack_v2/test/thread_span_links.cpp create mode 100644 releasenotes/notes/profiling-fix-data-race-segfault-b14f59c06c6bf9ed.yaml diff --git a/ddtrace/internal/datadog/profiling/stack_v2/CMakeLists.txt b/ddtrace/internal/datadog/profiling/stack_v2/CMakeLists.txt index 6c330c9c970..19f3955dab0 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/CMakeLists.txt +++ b/ddtrace/internal/datadog/profiling/stack_v2/CMakeLists.txt @@ -117,3 +117,9 @@ if(LIB_INSTALL_DIR) ARCHIVE DESTINATION ${LIB_INSTALL_DIR} RUNTIME DESTINATION ${LIB_INSTALL_DIR}) endif() + +if(BUILD_TESTING) + enable_testing() + add_subdirectory(test) +endif() + diff --git a/ddtrace/internal/datadog/profiling/stack_v2/include/thread_span_links.hpp b/ddtrace/internal/datadog/profiling/stack_v2/include/thread_span_links.hpp index 5f2f69e39ab..1d61f76fe46 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/include/thread_span_links.hpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/include/thread_span_links.hpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,7 @@ class ThreadSpanLinks ThreadSpanLinks& operator=(ThreadSpanLinks const&) = delete; void link_span(uint64_t thread_id, uint64_t span_id, uint64_t local_root_span_id, std::string span_type); - const Span* get_active_span_from_thread_id(uint64_t thread_id); + const std::optional get_active_span_from_thread_id(uint64_t thread_id); void reset(); static void postfork_child(); diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/stack_renderer.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/stack_renderer.cpp index 8abc37ee55a..9a07da3aac2 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/stack_renderer.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/stack_renderer.cpp @@ -55,8 +55,8 @@ StackRenderer::render_thread_begin(PyThreadState* tstate, ddup_push_threadinfo(sample, static_cast(thread_id), static_cast(native_id), name); ddup_push_walltime(sample, thread_state.wall_time_ns, 1); - const Span* active_span = ThreadSpanLinks::get_instance().get_active_span_from_thread_id(thread_id); - if (active_span != nullptr) { + const std::optional active_span = ThreadSpanLinks::get_instance().get_active_span_from_thread_id(thread_id); + if (active_span) { ddup_push_span_id(sample, active_span->span_id); ddup_push_local_root_span_id(sample, active_span->local_root_span_id); ddup_push_trace_type(sample, std::string_view(active_span->span_type)); diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp index 602aad9fdb3..80c5ef06bc1 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -19,15 +20,17 @@ ThreadSpanLinks::link_span(uint64_t thread_id, uint64_t span_id, uint64_t local_ thread_id_to_span[thread_id]->span_type = span_type; } -const Span* +const std::optional ThreadSpanLinks::get_active_span_from_thread_id(uint64_t thread_id) { std::lock_guard lock(mtx); - if (thread_id_to_span.find(thread_id) == thread_id_to_span.end()) { - return nullptr; + std::optional span; + auto it = thread_id_to_span.find(thread_id); + if (it != thread_id_to_span.end()) { + span = *(it->second); } - return thread_id_to_span[thread_id].get(); + return span; } void diff --git a/ddtrace/internal/datadog/profiling/stack_v2/test/CMakeLists.txt b/ddtrace/internal/datadog/profiling/stack_v2/test/CMakeLists.txt new file mode 100644 index 00000000000..23fcda3eedb --- /dev/null +++ b/ddtrace/internal/datadog/profiling/stack_v2/test/CMakeLists.txt @@ -0,0 +1,25 @@ +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.11.0) +set(gtest_force_shared_crt + ON + CACHE BOOL "" FORCE) +set(INSTALL_GTEST + OFF + CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) +include(GoogleTest) +include(AnalysisFunc) + +function(dd_wrapper_add_test name) + add_executable(${name} ${ARGN}) + target_include_directories(${name} PRIVATE ../include) + target_link_libraries(${name} PRIVATE gmock gtest_main _stack_v2) + add_ddup_config(${name}) + + gtest_discover_tests(${name}) +endfunction() + +# Add the tests +dd_wrapper_add_test(thread_span_links thread_span_links.cpp) diff --git a/ddtrace/internal/datadog/profiling/stack_v2/test/thread_span_links.cpp b/ddtrace/internal/datadog/profiling/stack_v2/test/thread_span_links.cpp new file mode 100644 index 00000000000..b668c3b3d80 --- /dev/null +++ b/ddtrace/internal/datadog/profiling/stack_v2/test/thread_span_links.cpp @@ -0,0 +1,48 @@ +#include + +#include +#include + +#include "thread_span_links.hpp" + +static void +get() +{ + for (int i = 0; i < 100; i++) { + std::string span_type; + for (int j = 0; j < i; j++) { + span_type.append("a"); + } + Datadog::ThreadSpanLinks::get_instance().link_span(42, 1, 2, span_type); + } +} + +static std::string +set() +{ + std::string s; + for (int i = 0; i < 100; i++) { + auto thing = Datadog::ThreadSpanLinks::get_instance().get_active_span_from_thread_id(42); + if (!thing) { + continue; + } + s = thing->span_type; + } + return s; +} + +TEST(ThreadSpanLinksConcurrency, GetSetRace) +{ + std::thread t1(get); + std::thread t2(set); + t1.join(); + t2.join(); +} + +int +main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + (void)(::testing::GTEST_FLAG(death_test_style) = "threadsafe"); + return RUN_ALL_TESTS(); +} diff --git a/releasenotes/notes/profiling-fix-data-race-segfault-b14f59c06c6bf9ed.yaml b/releasenotes/notes/profiling-fix-data-race-segfault-b14f59c06c6bf9ed.yaml new file mode 100644 index 00000000000..0dcae82aea8 --- /dev/null +++ b/releasenotes/notes/profiling-fix-data-race-segfault-b14f59c06c6bf9ed.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + profiling: fix a data race where span information associated with a thread + was read and updated concurrently, leading to segfaults