Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-1.6] backport signal handling improvements from #40056 #41774

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/gf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1817,7 +1817,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args,
jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
jl_ptls_t ptls = jl_get_ptls_states();
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
jl_critical_error(0, NULL, ptls->bt_data, &ptls->bt_size);
jl_critical_error(0, NULL);
abort();
}
// not reached
Expand Down
32 changes: 31 additions & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,36 @@ void __tsan_switch_to_fiber(void *fiber, unsigned flags);
# define JL_USE_IFUNC 0
#endif

// CFI_NORETURN: placed as a statement inside a function that never returns.
// Every non-empty expansion below already ends in `;`, so the macro is written
// without a trailing semicolon at the point of use.
//
// If we have smashed the stack (and are not merely in an ordinary NORETURN
// function), this will smash stack unwinding too.
#ifdef _OS_WINDOWS_
#if defined(_CPU_X86_64_)
// install the unhandled exception handler at the top of our stack
// to call directly into our personality handler
#define CFI_NORETURN \
asm volatile ("\t.seh_handler __julia_personality, @except\n\t.text");
#else
// other Windows targets: the macro expands to nothing
#define CFI_NORETURN
#endif
#else
// wipe out the call-stack unwind capability beyond this function
// (we are noreturn, so it is not a total lie)
#if defined(_CPU_X86_64_)
// per nongnu libunwind: "x86_64 ABI specifies that end of call-chain is marked with a NULL RBP or undefined return address"
// so we emit all three directives below (undefined rip, undefined rbp, and
// rbp as the return column), to be extra certain of it
#define CFI_NORETURN \
asm volatile ("\t.cfi_undefined rip"); \
asm volatile ("\t.cfi_undefined rbp"); \
asm volatile ("\t.cfi_return_column rbp");
#else
// per nongnu libunwind: "DWARF spec says undefined return address location means end of stack"
// we use whatever happens to be register 1 on this platform for this:
// it is marked undefined and named as the return column
#define CFI_NORETURN \
asm volatile ("\t.cfi_undefined 1"); \
asm volatile ("\t.cfi_return_column 1");
#endif
#endif

// If this is detected in a backtrace of segfault, it means the functions
// that use this value must be reworked into their async form with cb arg
// provided and with JL_UV_LOCK used around the calls
Expand Down Expand Up @@ -904,7 +934,7 @@ size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t
size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
#endif
JL_DLLEXPORT jl_value_t *jl_get_backtrace(void);
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size);
void jl_critical_error(int sig, bt_context_t *context);
JL_DLLEXPORT void jl_raise_debugger(void);
int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT;
JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT;
Expand Down
41 changes: 35 additions & 6 deletions src/signal-handling.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,44 @@ void jl_show_sigill(void *_ctx)
#endif
}

// what to do on a critical error
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size)
// what to do on a critical error on a thread
void jl_critical_error(int sig, bt_context_t *context)
{
// This function is not allowed to reference any TLS variables.
// We need to explicitly pass in the TLS buffer pointer when
// we make `jl_filename` and `jl_lineno` thread local.

jl_ptls_t ptls = jl_get_ptls_states();
jl_bt_element_t *bt_data = ptls->bt_data;
size_t *bt_size = &ptls->bt_size;
size_t i, n = *bt_size;
if (sig)
if (sig) {
// kill this task, so that we cannot get back to it accidentally (via an untimely ^C or jlbacktrace in jl_exit)
ptls->pgcstack = NULL;
ptls->safe_restore = NULL;
if (ptls->current_task) {
ptls->current_task->eh = NULL;
ptls->current_task->excstack = NULL;
}
#ifndef _OS_WINDOWS_
sigset_t sset;
sigemptyset(&sset);
// n.b. In `abort()`, Apple's libSystem "helpfully" blocks all signals
// on all threads but SIGABRT. But we also don't know what the thread
// was doing, so unblock all critical signals so that they will crash
// hard, and not just get stuck.
sigaddset(&sset, SIGSEGV);
sigaddset(&sset, SIGBUS);
sigaddset(&sset, SIGILL);
// also unblock fatal signals now, so we won't get back here twice
sigaddset(&sset, SIGTERM);
sigaddset(&sset, SIGABRT);
sigaddset(&sset, SIGQUIT);
// and the original signal is now fatal too, in case it wasn't
// something already listed (?)
if (sig != SIGINT)
sigaddset(&sset, sig);
pthread_sigmask(SIG_UNBLOCK, &sset, NULL);
#endif
jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
}
jl_safe_printf("in expression starting at %s:%d\n", jl_filename, jl_lineno);
if (context) {
// Must avoid extended backtrace frames here unless we're sure bt_data
Expand Down
122 changes: 82 additions & 40 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,16 @@ extern boolean_t exc_server(mach_msg_header_t *, mach_msg_header_t *);
// Thread entry point that runs mach_msg_server with exc_server on segv_port.
// `arg` is unused. Whenever mach_msg_server comes back, its status is printed
// via mach_error_string and the process exits with status 128 + SIGSEGV.
void *mach_segv_listener(void *arg)
{
    (void)arg;
    // the returned pointer is not needed here; only the call itself is made
    (void)jl_get_ptls_states();
    for (;;) {
        int status = mach_msg_server(exc_server, 2048, segv_port, MACH_MSG_TIMEOUT_NONE);
        jl_safe_printf("mach_msg_server: %s\n", mach_error_string(status));
        jl_exit(128 + SIGSEGV);
    }
}

static void allocate_segv_handler()

static void allocate_mach_handler()
{
// ensure KEYMGR_GCC3_DW2_OBJ_LIST is initialized, as this requires malloc
// and thus can deadlock when used without first initializing it.
Expand Down Expand Up @@ -122,7 +124,7 @@ static void allocate_segv_handler()
jl_error("pthread_create failed");
}
pthread_attr_destroy(&attr);
for (int16_t tid = 0;tid < jl_n_threads;tid++) {
for (int16_t tid = 0; tid < jl_n_threads; tid++) {
attach_exception_port(pthread_mach_thread_np(jl_all_tls_states[tid]->system_id), 0);
}
}
Expand Down Expand Up @@ -164,19 +166,31 @@ typedef arm_exception_state64_t host_exception_state_t;
// Edit the saved register state `state` belonging to thread `ptls2` so that,
// once a caller installs that state, execution continues at `fptr` on a
// 16-byte-aligned stack.
//
// ptls2: thread-local state of the target thread (its signal_stack is read).
// state: register state to modify in place; nothing is installed here.
// fptr:  function the thread should start executing.
//
// NOTE(review): this listing was taken from a diff view, so lines from before
// and after the change appear together (two declarations of `rsp`, and an
// `#else` immediately followed by `#elif`). Confirm against the real file
// which lines are current before relying on the details below.
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
void (*fptr)(void))
{
uint64_t rsp = (uint64_t)ptls2->signal_stack + sig_stack_size;
// start from the stack pointer currently recorded in `state`
#ifdef _CPU_X86_64_
uintptr_t rsp = state->__rsp;
#elif defined(_CPU_AARCH64_)
uintptr_t rsp = state->__sp;
#else
#error "julia: throw-in-context not supported on this platform"
#endif
// With no signal stack, or when the recorded stack pointer already lies on
// it, stay on the current stack: step 256 bytes down and round down to a
// multiple of 16. Otherwise move to signal_stack + sig_stack_size.
if (ptls2->signal_stack == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) {
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
}
else {
rsp = (uintptr_t)ptls2->signal_stack + sig_stack_size;
}
assert(rsp % 16 == 0);

// push (null) $RIP onto the stack
rsp -= sizeof(void*);
*(void**)rsp = NULL;

#ifdef _CPU_X86_64_
// reserve one pointer-sized slot below the aligned address
// (presumably where a `call` would have stored the return address — TODO confirm)
rsp -= sizeof(void*);
state->__rsp = rsp; // set stack pointer
state->__rip = (uint64_t)fptr; // "call" the function
#else
#elif defined(_CPU_AARCH64_)
// AArch64: set sp and pc, and zero the saved lr
state->__sp = rsp;
state->__pc = (uint64_t)fptr;
state->__lr = 0;
#else
#error "julia: throw-in-context not supported on this platform"
#endif
}

Expand All @@ -194,11 +208,22 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio
ptls2->sig_exception = exception;
}
jl_call_in_state(ptls2, &state, &jl_sig_throw);
ret = thread_set_state(thread, THREAD_STATE,
(thread_state_t)&state, count);
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
HANDLE_MACH_ERROR("thread_set_state", ret);
}

// Signal handler for SIGSEGV and SIGBUS (asserted below).
// If the current thread has `safe_restore` set, the interrupted context is
// rewritten so that it continues in jl_sig_throw; in every other case the
// signal is handed to sigdie_handler unchanged.
static void segv_handler(int sig, siginfo_t *info, void *context)
{
    jl_ptls_t self_tls = jl_get_ptls_states();
    assert(sig == SIGSEGV || sig == SIGBUS);
    if (!self_tls->safe_restore) {
        sigdie_handler(sig, info, context);
        return;
    }
    // restarting jl_ or jl_unwind_stepn
    host_thread_state_t *interrupted = (host_thread_state_t*)jl_to_bt_context(context);
    jl_call_in_state(self_tls, interrupted, &jl_sig_throw);
}

//exc_server uses dlsym to find symbol
JL_DLLEXPORT
kern_return_t catch_exception_raise(mach_port_t exception_port,
Expand All @@ -208,18 +233,16 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
exception_data_t code,
mach_msg_type_number_t code_count)
{
unsigned int count = THREAD_STATE_COUNT;
unsigned int exc_count = HOST_EXCEPTION_STATE_COUNT;
host_exception_state_t exc_state;
host_thread_state_t state;
#ifdef LIBOSXUNWIND
#ifdef LLVMLIBUNWIND
if (thread == mach_profiler_thread) {
return profiler_segv_handler(exception_port, thread, task, exception, code, code_count);
}
#endif
int16_t tid;
jl_ptls_t ptls2 = NULL;
for (tid = 0;tid < jl_n_threads;tid++) {
for (tid = 0; tid < jl_n_threads; tid++) {
jl_ptls_t _ptls2 = jl_all_tls_states[tid];
if (pthread_mach_thread_np(_ptls2->system_id) == thread) {
ptls2 = _ptls2;
Expand Down Expand Up @@ -288,11 +311,8 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
return KERN_SUCCESS;
}
else {
kern_return_t ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)&state, &count);
HANDLE_MACH_ERROR("thread_get_state", ret);
jl_critical_error(SIGSEGV, (unw_context_t*)&state,
ptls2->bt_data, &ptls2->bt_size);
return KERN_INVALID_ARGUMENT;
jl_exit_thread0(128 + SIGSEGV, NULL, 0);
return KERN_SUCCESS;
}
}

Expand All @@ -307,24 +327,27 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
}

// Suspend the thread with index `tid` in jl_all_tls_states and read its
// register state.
//
// NOTE(review): this listing was taken from a diff view, so lines from before
// and after the change appear together (two signatures, two declarations of
// the thread port and of `ret`, two memset and thread_get_state calls).
// Confirm against the real file which lines are current.
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
{
jl_ptls_t ptls2 = jl_all_tls_states[tid];
mach_port_t tid_port = pthread_mach_thread_np(ptls2->system_id);
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);

kern_return_t ret = thread_suspend(tid_port);
kern_return_t ret = thread_suspend(thread);
HANDLE_MACH_ERROR("thread_suspend", ret);

// Do the actual sampling
unsigned int count = THREAD_STATE_COUNT;
static unw_context_t state;
memset(&state, 0, sizeof(unw_context_t));
// zero the caller's buffer before it is filled in
memset(ctx, 0, sizeof(*ctx));

// Get the state of the suspended thread
// NOTE(review): unlike thread_suspend above, the result stored in `ret` is
// not passed to HANDLE_MACH_ERROR here — confirm that this is intended.
ret = thread_get_state(tid_port, THREAD_STATE, (thread_state_t)&state, &count);
ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)ctx, &count);
}

// Initialize the unwind context with the suspended thread's state
*ctx = &state;
// Variant that hands the state back through `*ctx` as an unw_context_t pointer.
// `state` is static, so `*ctx` refers to storage that the next call overwrites.
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
{
static host_thread_state_t state;
jl_thread_suspend_and_get_state2(tid, &state);
*ctx = (unw_context_t*)&state;
}

static void jl_thread_resume(int tid, int sig)
Expand Down Expand Up @@ -366,29 +389,46 @@ static void jl_try_deliver_sigint(void)
HANDLE_MACH_ERROR("thread_resume", ret);
}

static void jl_exit_thread0(int exitstate)
// Exit routine that jl_exit_thread0 selects as the function for thread 0 to
// run (via jl_call_in_state). Does not return.
//
// exitstate: process exit status; the value passed to jl_critical_error as the
//            signal number is exitstate - 128 (callers pass e.g. 128 + SIGSEGV).
static void JL_NORETURN jl_exit_thread0_cb(int exitstate)
{
// no trailing semicolon: the macro's expansions carry their own
CFI_NORETURN
// report the critical error (no context is supplied), then exit
jl_critical_error(exitstate - 128, NULL);
jl_exit(exitstate);
}

static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size)
{
jl_ptls_t ptls2 = jl_all_tls_states[0];
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
kern_return_t ret = thread_suspend(thread);
HANDLE_MACH_ERROR("thread_suspend", ret);

host_thread_state_t state;
jl_thread_suspend_and_get_state2(0, &state);
unw_context_t *uc = (unw_context_t*)&state;

// This aborts `sleep` and other syscalls.
ret = thread_abort(thread);
kern_return_t ret = thread_abort(thread);
HANDLE_MACH_ERROR("thread_abort", ret);

unsigned int count = THREAD_STATE_COUNT;
host_thread_state_t state;
ret = thread_get_state(thread, THREAD_STATE,
(thread_state_t)&state, &count);
if (bt_data == NULL) {
// Must avoid extended backtrace frames here unless we're sure bt_data
// is properly rooted.
ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, uc, NULL);
}
else {
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
}

void (*exit_func)(int) = &_exit;
if (thread0_exit_count <= 1) {
exit_func = &jl_exit;
exit_func = &jl_exit_thread0_cb;
}
else if (thread0_exit_count == 2) {
exit_func = &exit;
}
else {
exit_func = &_exit;
}

#ifdef _CPU_X86_64_
// First integer argument. Not portable but good enough =)
Expand All @@ -399,8 +439,8 @@ static void jl_exit_thread0(int exitstate)
#error Fill in first integer argument here
#endif
jl_call_in_state(ptls2, &state, (void (*)(void))exit_func);
ret = thread_set_state(thread, THREAD_STATE,
(thread_state_t)&state, count);
unsigned int count = THREAD_STATE_COUNT;
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
HANDLE_MACH_ERROR("thread_set_state", ret);

ret = thread_resume(thread);
Expand Down Expand Up @@ -498,8 +538,10 @@ void *mach_profile_listener(void *arg)
break;
}

unw_context_t *uc;
jl_thread_suspend_and_get_state(i, &uc);
host_thread_state_t state;
jl_thread_suspend_and_get_state2(i, &state);
unw_context_t *uc = (unw_context_t*)&state;

if (running) {
#ifdef LIBOSXUNWIND
/*
Expand Down
Loading