Skip to content

Commit

Permalink
rearrange jl_delete_thread to be thread-safe
Browse files Browse the repository at this point in the history
Prior to this, especially on macOS, the gc-safepoint here would cause
the process to segfault as we had already freed the current_task state.
Rearrange this code so that the GC interactions (except for the atomic
store to current_task) are all handled before entering GC safe, and then
signaling the thread is deleted (via setting current_task = NULL,
published by jl_unlock_profile_wr to other threads) is last.

```
ERROR: Exception handler triggered on unmanaged thread.
Process 53827 stopped
* thread #5, stop reason = EXC_BAD_ACCESS (code=2, address=0x100018008)
    frame #0: 0x0000000100b74344 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_set(ptls=0x000000011f8b3200, state='\x02', old_state=<unavailable>) at julia_threads.h:272:9 [opt]
   269 	    assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD);
   270 	    jl_atomic_store_release(&ptls->gc_state, state);
   271 	    if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE)
-> 272 	        jl_gc_safepoint_(ptls);
   273 	    return old_state;
   274 	}
   275 	STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
Target 0: (julia) stopped.
(lldb) up
frame #1: 0x0000000100b74320 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_save_and_set(ptls=0x000000011f8b3200, state='\x02') at julia_threads.h:278:12 [opt]
   275 	STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
   276 	                                              int8_t state)
   277 	{
-> 278 	    return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state));
   279 	}
   280 	#ifdef __clang_gcanalyzer__
   281 	// these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically)
(lldb)
frame #2: 0x0000000100b7431c libjulia-internal.1.12.0.dylib`jl_delete_thread(value=0x000000011f8b3200) at threading.c:537:11 [opt]
   534 	    ptls->root_task = NULL;
   535 	    jl_free_thread_gc_state(ptls);
   536 	    // then park in safe-region
-> 537 	    (void)jl_gc_safe_enter(ptls);
   538 	}
```
  • Loading branch information
vtjnash committed Sep 23, 2024
1 parent 3876d43 commit 49ead81
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 21 deletions.
46 changes: 25 additions & 21 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,30 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
// prior unsafe-region (before we let it release the stack memory)
(void)jl_gc_unsafe_enter(ptls);
scheduler_delete_thread(ptls);
// need to clear pgcstack and eh, but we can clear everything now too
jl_task_t *ct = jl_atomic_load_relaxed(&ptls->current_task);
jl_task_frame_noreturn(ct);
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
// the system will probably free this stack memory soon
// so prevent any other thread from accessing it later
if (ct != ptls->root_task)
jl_task_frame_noreturn(ptls->root_task);
}
else {
// Uh oh. The user cleared the sticky bit so it started running
// elsewhere, then called pthread_exit on this thread from another
// Task, which will free the stack memory of that root task soon. This
// is not recoverable. Though we could just hang here, a fatal message
// is likely better.
jl_safe_printf("fatal: thread exited from wrong Task.\n");
abort();
}
ptls->previous_exception = NULL;
// allow the page root_task is on to be freed
ptls->root_task = NULL;
jl_free_thread_gc_state(ptls);
// park in safe-region from here on (this may run GC again)
(void)jl_gc_safe_enter(ptls);
// try to free some state we do not need anymore
#ifndef _OS_WINDOWS_
void *signal_stack = ptls->signal_stack;
Expand Down Expand Up @@ -502,21 +526,7 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
#else
pthread_mutex_lock(&in_signal_lock);
#endif
// need to clear pgcstack and eh, but we can clear everything now too
jl_task_frame_noreturn(jl_atomic_load_relaxed(&ptls->current_task));
if (jl_set_task_tid(ptls->root_task, ptls->tid)) {
// the system will probably free this stack memory soon
// so prevent any other thread from accessing it later
jl_task_frame_noreturn(ptls->root_task);
}
else {
// Uh oh. The user cleared the sticky bit so it started running
// elsewhere, then called pthread_exit on this thread. This is not
// recoverable. Though we could just hang here, a fatal message is better.
jl_safe_printf("fatal: thread exited from wrong Task.\n");
abort();
}
jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead
jl_atomic_store_relaxed(&ptls->current_task, NULL); // indicate dead
// finally, release all of the locks we had grabbed
#ifdef _OS_WINDOWS_
jl_unlock_profile_wr();
Expand All @@ -529,12 +539,6 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
#endif
free(ptls->bt_data);
small_arraylist_free(&ptls->locks);
ptls->previous_exception = NULL;
// allow the page root_task is on to be freed
ptls->root_task = NULL;
jl_free_thread_gc_state(ptls);
// then park in safe-region
(void)jl_gc_safe_enter(ptls);
}

//// debugging hack: if we are exiting too fast for error message printing on threads,
Expand Down
2 changes: 2 additions & 0 deletions test/threads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,11 @@ let e = Base.Event(true),
onces = Vector{Vector{Nothing}}(undef, length(tids))
for i = 1:length(tids)
function cl()
GC.gc(false) # stress test the GC-safepoint mechanics of jl_adopt_thread
local y = once()
onces[i] = y
@test x !== y === once()
GC.gc(false) # stress test the GC-safepoint mechanics of jl_delete_thread
nothing
end
function threadcallclosure(cl::F) where {F} # create sparam so we can reference the type of cl in the ccall type
Expand Down

0 comments on commit 49ead81

Please sign in to comment.