From 5aa7f67bd8f11132701a290cc7766f27282bd2fa Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 13 Nov 2021 17:12:24 -0800 Subject: [PATCH] Make profiling more robust with many tasks (#42978) This patch includes two sets of changes. (1) `jl_thread_suspend_and_get_state` uses `pthread_cond_timedwait` to recover from the case where the request is not received by the signal handler. This is required because `usr2_handler` contains some paths for the case where it is not possible to obtain `ptls`. (2) `ctx_switch` now makes sure to null out `ptls` of the last task (`lastt->ptls = NULL`) after changing the current task by updating pgcstack (`jl_set_pgcstack(&t->gcstack)`). This closes the gap in which `usr2_handler` can observe the null `ptls`. Co-authored-by: Jameson Nash (cherry picked from commit 81315807a5bb0c581d85d4fbaa4f09cf5dccfca4) --- src/signals-unix.c | 19 +++++++++++++++++- src/task.c | 5 +++-- test/profile_spawnmany_exec.jl | 14 +++++++++++++ test/threads.jl | 36 ++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 test/profile_spawnmany_exec.jl diff --git a/src/signals-unix.c b/src/signals-unix.c index 617b6114da7ee..42f615c8ed64b 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS) #define MAP_ANONYMOUS MAP_ANON @@ -364,11 +365,25 @@ static pthread_cond_t signal_caught_cond; static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += 1; pthread_mutex_lock(&in_signal_lock); jl_ptls_t ptls2 = jl_all_tls_states[tid]; jl_atomic_store_release(&ptls2->signal_request, 1); pthread_kill(ptls2->system_id, SIGUSR2); - pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge + // wait for thread to acknowledge + int err = pthread_cond_timedwait(&signal_caught_cond, &in_signal_lock, &ts); + if (err == ETIMEDOUT) { + sig_atomic_t request = 1; + if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) { + *ctx = NULL; + pthread_mutex_unlock(&in_signal_lock); + return; + } + err = pthread_cond_wait(&signal_caught_cond, &in_signal_lock); + } + assert(!err); assert(jl_atomic_load_acquire(&ptls2->signal_request) == 0); *ctx = signal_context; } @@ -750,6 +765,8 @@ static void *signal_listener(void *arg) for (int i = jl_n_threads; i-- > 0; ) { // notify thread to stop jl_thread_suspend_and_get_state(i, &signal_context); + if (signal_context == NULL) + continue; // do backtrace on thread contexts for critical signals // this part must be signal-handler safe diff --git a/src/task.c b/src/task.c index ceb114617fc38..fb46b73b32ebd 100644 --- a/src/task.c +++ b/src/task.c @@ -388,18 +388,19 @@ static void ctx_switch(jl_task_t *lastt) else #endif *pt = NULL; // can't fail after here: clear the gc-root for the target task now - lastt->ptls = NULL; } // set up global state for new task and clear global state for old task t->ptls = ptls; ptls->current_task = t; JL_GC_PROMISE_ROOTED(t); + jl_signal_fence(); + jl_set_pgcstack(&t->gcstack); + jl_signal_fence(); lastt->ptls = NULL; #ifdef MIGRATE_TASKS ptls->previous_task = lastt; #endif - jl_set_pgcstack(&t->gcstack); #if defined(JL_TSAN_ENABLED) tsan_switch_to_ctx(&t->tsan_state); diff --git a/test/profile_spawnmany_exec.jl b/test/profile_spawnmany_exec.jl new file mode 100644 index 0000000000000..a061de40d5172 --- /dev/null +++ b/test/profile_spawnmany_exec.jl @@ -0,0 +1,14 @@ +# This file is a part of Julia. License is MIT: https://julialang.org/license + +using Profile + +function spawnmany(n) + if n > 2 + m = n รท 2 + t = Threads.@spawn spawnmany(m) + spawnmany(m) + wait(t) + end +end + +@profile spawnmany(parse(Int, get(ENV, "NTASKS", "2000000"))) diff --git a/test/threads.jl b/test/threads.jl index 736cecada3cd8..1e4c4b4f6a5f3 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -147,3 +147,39 @@ end # We don't need the watchdog anymore close(proc.in) + +# https://github.com/JuliaLang/julia/pull/42973 +Sys.islinux() && @testset "spawn and wait *a lot* of tasks in @profile" begin + # Not using threads_exec.jl for better isolation, reproducibility, and a + # tighter timeout. + script = "profile_spawnmany_exec.jl" + cmd = `$(Base.julia_cmd()) --depwarn=error --rr-detach --startup-file=no $script` + @testset for n in [20000, 200000, 2000000] + proc = run(ignorestatus(setenv(cmd, "NTASKS" => n; dir = @__DIR__)); wait = false) + done = Threads.Atomic{Bool}(false) + timeout = false + timer = Timer(100) do _ + timeout = true + for sig in [Base.SIGTERM, Base.SIGHUP, Base.SIGKILL] + for _ in 1:1000 + kill(proc, sig) + if done[] + if sig != Base.SIGTERM + @warn "Terminating `$script` required signal $sig" + end + return + end + sleep(0.001) + end + end + end + try + wait(proc) + finally + done[] = true + close(timer) + end + @test success(proc) + @test !timeout + end +end