From 47061cea49523b3a426d7f0f54953e0dcfc7e2ed Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Thu, 29 Aug 2024 14:45:01 -0400
Subject: [PATCH 1/2] add pending state back to
 jl_thread_suspend_and_get_state-machine (#55622)

Fixes an issue with #55500, where a signal could abruptly abort the
process: it would observe that the process was still handling the resume
SIGUSR2 message, and it had no way to wait for that handling to finish
before setting the new message to exit.

(cherry picked from commit da3468c1208b087161af5b69a26a92a91967a367)
---
 src/signals-unix.c | 65 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 10 deletions(-)

diff --git a/src/signals-unix.c b/src/signals-unix.c
index 5732fd1e9c91d..a2b56952f71bd 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -413,6 +413,7 @@ static int signal_caught_cond = -1;
 
 int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
 {
+    int err;
     pthread_mutex_lock(&in_signal_lock);
     jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
     jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
@@ -421,22 +422,51 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
         pthread_mutex_unlock(&in_signal_lock);
         return 0;
     }
-    sig_atomic_t request = 0;
-    if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) {
+    if (jl_atomic_load(&ptls2->signal_request) != 0) {
         // something is wrong, or there is already a usr2 in flight elsewhere
+        // try to wait for it to finish or wait for timeout
+        struct pollfd event = {signal_caught_cond, POLLIN, 0};
+        do {
+            err = poll(&event, 1, timeout * 1000);
+        } while (err == -1 && errno == EINTR);
+        if (err == -1 || (event.revents & POLLIN) == 0) {
+            // not ready after timeout: cancel this request
+            pthread_mutex_unlock(&in_signal_lock);
+            return 0;
+        }
+    }
+    // check for  any stale signal_caught_cond events
+    struct pollfd event = {signal_caught_cond, POLLIN, 0};
+    do {
+        err = poll(&event, 1, 0);
+    } while (err == -1 && errno == EINTR);
+    if (err == -1) {
         pthread_mutex_unlock(&in_signal_lock);
         return 0;
     }
+    if ((event.revents & POLLIN) != 0) {
+        // consume it before continuing
+        eventfd_t got;
+        do {
+            err = read(signal_caught_cond, &got, sizeof(eventfd_t));
+        } while (err == -1 && errno == EINTR);
+        if (err != sizeof(eventfd_t)) abort();
+        assert(got == 1); (void) got;
+    }
+    sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1);
+    assert(request == 0 || request == -1);
     request = 1;
-    int err = pthread_kill(ptls2->system_id, SIGUSR2);
-    // wait for thread to acknowledge or timeout
-    struct pollfd event = {signal_caught_cond, POLLIN, 0};
+    err = pthread_kill(ptls2->system_id, SIGUSR2);
     if (err == 0) {
+        // wait for thread to acknowledge or timeout
+        struct pollfd event = {signal_caught_cond, POLLIN, 0};
         do {
             err = poll(&event, 1, timeout * 1000);
         } while (err == -1 && errno == EINTR);
+        if (err != 1 || (event.revents & POLLIN) == 0)
+            err = -1;
     }
-    if ((event.revents & POLLIN) == 0) {
+    if (err == -1) {
         // not ready after timeout: try to cancel this request
         if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
             pthread_mutex_unlock(&in_signal_lock);
@@ -452,7 +482,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
     // Now the other thread is waiting on exit_signal_cond (verify that here by
     // checking it is 0, and add an acquire barrier for good measure)
     request = jl_atomic_load_acquire(&ptls2->signal_request);
-    assert(request == 0); (void) request;
+    assert(request == 0 || request == -1); (void) request;
     jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
     *ctx = *signal_context;
     return 1;
@@ -511,6 +541,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
 }
 
 // request:
+// -1: processing
 //  0: nothing [not from here]
 //  1: get state & wait for request
 //  2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
@@ -526,22 +557,36 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
     if (ptls == NULL)
         return;
     int errno_save = errno;
-    // acknowledge that we saw the signal_request
-    sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0);
+    sig_atomic_t request = jl_atomic_load(&ptls->signal_request);
+    if (request == 0)
+        return;
+    if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1))
+        return;
     if (request == 1) {
         signal_context = jl_to_bt_context(ctx);
+        // acknowledge that we saw the signal_request and set signal_context
         int err;
         eventfd_t got = 1;
         err = write(signal_caught_cond, &got, sizeof(eventfd_t));
         if (err != sizeof(eventfd_t)) abort();
+        sig_atomic_t processing = -1;
+        jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
+        // wait for exit signal
         do {
             err = read(exit_signal_cond, &got, sizeof(eventfd_t));
         } while (err == -1 && errno == EINTR);
         if (err != sizeof(eventfd_t)) abort();
         assert(got == 1);
-        request = jl_atomic_exchange(&ptls->signal_request, 0);
+        request = jl_atomic_exchange(&ptls->signal_request, -1);
+        signal_context = NULL;
         assert(request == 2 || request == 3 || request == 4);
     }
+    int err;
+    eventfd_t got = 1;
+    err = write(signal_caught_cond, &got, sizeof(eventfd_t));
+    if (err != sizeof(eventfd_t)) abort();
+    sig_atomic_t processing = -1;
+    jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
     if (request == 2) {
         int force = jl_check_force_sigint();
         if (force || (!ptls->defer_signal && ptls->io_wait)) {

From 0495045cc11bf9088ace3e218157df6ffc08c2b8 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Sat, 7 Sep 2024 18:16:22 -0400
Subject: [PATCH 2/2] [Profile] fix threading issue (#55704)

I forgot about the existence of threads, so I had hard-coded this to
support only one thread. Clearly that is not sufficient, so use the
semaphore here the way it is intended to be used.

Fixes #55703

---------

Co-authored-by: Ian Butterworth <i.r.butterworth@gmail.com>
(cherry picked from commit 4f0a333d9d76df76a6383ed2113e66c789d5ecee)
---
 src/signals-unix.c              | 24 ++++++++++--------------
 stdlib/Profile/test/runtests.jl |  3 ++-
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/signals-unix.c b/src/signals-unix.c
index a2b56952f71bd..2aafd335a68b8 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -410,6 +410,7 @@ pthread_mutex_t in_signal_lock; // shared with jl_delete_thread
 static bt_context_t *signal_context; // protected by in_signal_lock
 static int exit_signal_cond = -1;
 static int signal_caught_cond = -1;
+static int signals_inflight = 0;
 
 int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
 {
@@ -422,7 +423,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
         pthread_mutex_unlock(&in_signal_lock);
         return 0;
     }
-    if (jl_atomic_load(&ptls2->signal_request) != 0) {
+    while (signals_inflight) {
         // something is wrong, or there is already a usr2 in flight elsewhere
         // try to wait for it to finish or wait for timeout
         struct pollfd event = {signal_caught_cond, POLLIN, 0};
@@ -434,25 +435,16 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
             pthread_mutex_unlock(&in_signal_lock);
             return 0;
         }
-    }
-    // check for  any stale signal_caught_cond events
-    struct pollfd event = {signal_caught_cond, POLLIN, 0};
-    do {
-        err = poll(&event, 1, 0);
-    } while (err == -1 && errno == EINTR);
-    if (err == -1) {
-        pthread_mutex_unlock(&in_signal_lock);
-        return 0;
-    }
-    if ((event.revents & POLLIN) != 0) {
         // consume it before continuing
         eventfd_t got;
         do {
             err = read(signal_caught_cond, &got, sizeof(eventfd_t));
         } while (err == -1 && errno == EINTR);
         if (err != sizeof(eventfd_t)) abort();
-        assert(got == 1); (void) got;
+        assert(signals_inflight >= got);
+        signals_inflight -= got;
     }
+    signals_inflight++;
     sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1);
     assert(request == 0 || request == -1);
     request = 1;
@@ -469,6 +461,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
     if (err == -1) {
         // not ready after timeout: try to cancel this request
         if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
+            signals_inflight--;
             pthread_mutex_unlock(&in_signal_lock);
             return 0;
         }
@@ -478,7 +471,9 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
         err = read(signal_caught_cond, &got, sizeof(eventfd_t));
     } while (err == -1 && errno == EINTR);
     if (err != sizeof(eventfd_t)) abort();
-    assert(got == 1); (void) got;
+    assert(signals_inflight >= got);
+    signals_inflight -= got;
+    signals_inflight++;
     // Now the other thread is waiting on exit_signal_cond (verify that here by
     // checking it is 0, and add an acquire barrier for good measure)
     request = jl_atomic_load_acquire(&ptls2->signal_request);
@@ -505,6 +500,7 @@ static void jl_try_deliver_sigint(void)
     jl_safepoint_enable_sigint();
     jl_wake_libuv();
     pthread_mutex_lock(&in_signal_lock);
+    signals_inflight++;
     jl_atomic_store_release(&ptls2->signal_request, 2);
     // This also makes sure `sleep` is aborted.
     pthread_kill(ptls2->system_id, SIGUSR2);
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index cbfdde61d7054..958f1fefb6981 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -168,7 +168,8 @@ let cmd = Base.julia_cmd()
         println("done")
         print(Profile.len_data())
         """
-    p = open(`$cmd -e $script`)
+    # use multiple threads here to ensure that profiling works with threading
+    p = open(`$cmd -t2 -e $script`)
     t = Timer(120) do t
         # should be under 10 seconds, so give it 2 minutes then report failure
         println("KILLING debuginfo registration test BY PROFILE TEST WATCHDOG\n")