Fix SMP races with thread swap and abort #21903
Changes from all commits: ad44159, 5297606, 4d98b8a, 102d28f, 946919c, 78b1402, dbaf43a

@@ -17,4 +17,3 @@ CONFIG_SMP=y
 CONFIG_MP_NUM_CPUS=2
 CONFIG_X86_MMU=y
 CONFIG_X86_VERY_EARLY_CONSOLE=y
-CONFIG_MP_NUM_CPUS=1

@@ -15,6 +15,13 @@
 #include <stdbool.h>
 #include <kernel_internal.h>

+/* Maximum time between the time a self-aborting thread flags itself
+ * DEAD and the last read or write to its stack memory (i.e. the time
+ * of its next swap()).  In theory this might be tuned per platform,
+ * but in practice this conservative value should be safe.
+ */
+#define THREAD_ABORT_DELAY_US 500
+
 #if defined(CONFIG_SCHED_DUMB)
 #define _priq_run_add z_priq_dumb_add
 #define _priq_run_remove z_priq_dumb_remove

@@ -436,7 +443,20 @@ void z_thread_single_abort(struct k_thread *thread)
                thread->base.pended_on = NULL;
            }
        }
-       thread->base.thread_state |= _THREAD_DEAD;
+
+       u32_t mask = _THREAD_DEAD;
+
+       /* If the abort is happening in interrupt context,
+        * that means that execution will never return to the
+        * thread's stack and that the abort is known to be
+        * complete.  Otherwise the thread still runs a bit
+        * until it can swap, requiring a delay.
+        */
+       if (IS_ENABLED(CONFIG_SMP) && arch_is_in_isr()) {
+           mask |= _THREAD_ABORTED_IN_ISR;
+       }
+
+       thread->base.thread_state |= mask;
    }

    sys_trace_thread_abort(thread);

@@ -616,10 +636,9 @@ void z_thread_priority_set(struct k_thread *thread, int prio)
 {
    bool need_sched = z_set_prio(thread, prio);

-   if (IS_ENABLED(CONFIG_SMP) &&
-       !IS_ENABLED(CONFIG_SCHED_IPI_SUPPORTED)) {
-       z_sched_ipi();
-   }
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+   arch_sched_ipi();
+#endif

    if (need_sched && _current->base.sched_locked == 0) {
        z_reschedule_unlocked();

@@ -738,6 +757,8 @@ void *z_get_next_switch_handle(void *interrupted)
        !IS_ENABLED(CONFIG_SCHED_IPI_SUPPORTED)) {
        z_sched_ipi();
    }
+
+   wait_for_switch(_current);
    return _current->switch_handle;
 }
 #endif

@@ -1144,10 +1165,9 @@ void z_impl_k_wakeup(k_tid_t thread)
        z_reschedule_unlocked();
    }

-   if (IS_ENABLED(CONFIG_SMP) &&
-       !IS_ENABLED(CONFIG_SCHED_IPI_SUPPORTED)) {
-       z_sched_ipi();
-   }
+#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
+   arch_sched_ipi();
+#endif
 }

 #ifdef CONFIG_SMP

@@ -1180,7 +1200,9 @@ void z_sched_abort(struct k_thread *thread)
     * it locally.  Not all architectures support that, alas.  If
     * we don't have it, we need to wait for some other interrupt.
     */
+   key = k_spin_lock(&sched_spinlock);
    thread->base.thread_state |= _THREAD_ABORTING;
+   k_spin_unlock(&sched_spinlock, key);
 #ifdef CONFIG_SCHED_IPI_SUPPORTED
    arch_sched_ipi();
 #endif

@@ -1204,6 +1226,16 @@ void z_sched_abort(struct k_thread *thread)
            k_busy_wait(100);
        }
    }
+
+   /* If the thread self-aborted (e.g. its own exit raced with
+    * this external abort) then even though it is flagged DEAD,
+    * it's still running until its next swap and thus the thread
+    * object is still in use.  We have to resort to a fallback
+    * delay in that circumstance.
+    */
+   if ((thread->base.thread_state & _THREAD_ABORTED_IN_ISR) == 0U) {
+       k_busy_wait(THREAD_ABORT_DELAY_US);
+   }
 }
 #endif

Review discussion on the k_busy_wait(THREAD_ABORT_DELAY_US) fallback:

Comment: I feel uneasy about this. Is there some state in the other thread we can poll in a loop, instead of just an arbitrary delay? Also, I'm assuming that this logic happens with interrupts unlocked?

Reply: There is not, absent changing the way swap works internally. The thread stack is in use right up to the swap call, but the swapped-to context may have been interrupted; there is nowhere at the OS level to place code to run "after swap", it just swaps back to whatever it was doing. I tried to find a way to make this work such that the scheduler lock is held and synchronously released at swap time, but even then "synchronously" isn't synchronous enough, because the CPU is still running C code at the point where that spinlock gets released. An architectural fix would have every call to z_swap set a value in the swapped-from thread struct after the last memory access to its data had occurred; then we could spin waiting for that. And yes, this delay happens outside the lock.

Reply: Thinking out loud here: if the race condition is two different contexts both trying to complete k_thread_abort() on the same thread object, could we introduce a per-thread object spinlock to synchronize this?
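
For illustration only, here is a minimal sketch of the "set a value in the swapped-from thread struct after its last memory access, then spin on it" idea described in the reply above. The switched_out field, z_mark_switched_out() hook, and wait_for_thread_exit() helper are names invented here and are not part of this PR; the sketch only assumes Zephyr's existing atomic_set()/atomic_get() and k_busy_wait() APIs.

/* Hypothetical sketch, not code from this PR.  Assumes a new atomic_t
 * "switched_out" field in the thread's base struct and a hook placed at
 * the very end of the arch switch path, after the outgoing thread's
 * stack and struct are no longer touched.
 */

/* The arch switch code would call this once the outgoing thread's
 * memory will no longer be read or written.
 */
static inline void z_mark_switched_out(struct k_thread *old_thread)
{
    atomic_set(&old_thread->base.switched_out, 1);
}

/* The aborting CPU could then poll this flag instead of sleeping for a
 * fixed THREAD_ABORT_DELAY_US.
 */
static void wait_for_thread_exit(struct k_thread *thread)
{
    if ((thread->base.thread_state & _THREAD_ABORTED_IN_ISR) != 0U) {
        return; /* aborted from an ISR: already off its stack */
    }

    while (atomic_get(&thread->base.switched_out) == 0) {
        k_busy_wait(1); /* brief spin rather than an arbitrary 500 us */
    }
}

The trade-off is the one raised in the comment thread: every swap path on every architecture would have to guarantee the flag is written only after the last touch of the outgoing stack, which is exactly the ordering that is hard to establish today.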

@@ -78,7 +78,6 @@ struct k_thread ct_thread;
 K_THREAD_STACK_DEFINE(ct_stack, STACKSIZE);

 _app_enc_d char encMSG[] = "ENC!\n";
-_app_enc_d int enc_state = 1;
 _app_enc_b char enc_pt[50]; /* Copy form shared pt */
 _app_enc_b char enc_ct[50]; /* Copy to shared ct */

@@ -104,6 +103,15 @@ void main(void)
    struct k_mem_partition *dom0_parts[] = {&part0, &part1};
    k_tid_t tPT, tENC, tCT;

+   fBUFIN = 0; /* clear flags */
+   fBUFOUT = 0;
+   calc_rev_wheel((BYTE *) &W1, (BYTE *)&W1R);
+   calc_rev_wheel((BYTE *) &W2, (BYTE *)&W2R);
+   calc_rev_wheel((BYTE *) &W3, (BYTE *)&W3R);
+   IW1 = 0;
+   IW2 = 0;
+   IW3 = 0;
+
    k_thread_access_grant(k_current_get(), &allforone);

    /*

Review comment on the initialization moved into main():

The change looks good, but I'm still worried about how this was crashing by the emulator suddenly exiting. Even with a race it should at least produce an exception that can be handled; if the whole thing triple-faults, that tells me we might have a bug in arch/x86 somewhere that I should try to investigate more, unless this race somehow got the page tables corrupted. What do you think?

@@ -169,17 +177,6 @@ void enc(void)
 {

    int index, index_out;
-   if (enc_state == 1) {
-       fBUFIN = 0; /* clear flags */
-       fBUFOUT = 0;
-       calc_rev_wheel((BYTE *) &W1, (BYTE *)&W1R);
-       calc_rev_wheel((BYTE *) &W2, (BYTE *)&W2R);
-       calc_rev_wheel((BYTE *) &W3, (BYTE *)&W3R);
-       IW1 = 0;
-       IW2 = 0;
-       IW3 = 0;
-       enc_state = 0;
-   }

    while (1) {
        k_sem_take(&allforone, K_FOREVER);

Review comment on the arch_is_in_isr() check in z_thread_single_abort():

I'm not sure I understand this. z_thread_single_abort() can be called with a thread object running on another CPU, but we're checking if the current CPU is in an ISR. Do we need to check the state of the CPU that the thread is running on?
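
To make the question concrete, here is a rough sketch of checking the target thread's CPU rather than the current CPU. The thread_active_elsewhere() helper is a name invented here for illustration and is not code from this PR; the sketch assumes Zephyr's internal _kernel.cpus[] array, arch_curr_cpu(), and CONFIG_MP_NUM_CPUS.

/* Hypothetical helper, not part of this PR: returns true if "thread"
 * is the thread currently executing on some other CPU, which is the
 * condition the reviewer is asking about (as opposed to whether the
 * aborting CPU itself is running in an ISR).
 */
static bool thread_active_elsewhere(struct k_thread *thread)
{
    unsigned int this_cpu = arch_curr_cpu()->id;

    for (unsigned int i = 0; i < CONFIG_MP_NUM_CPUS; i++) {
        if ((i != this_cpu) && (_kernel.cpus[i].current == thread)) {
            return true;
        }
    }

    return false;
}

Any such check would have to run under the scheduler lock, since the target thread could be switched in or out on another CPU while the loop is scanning.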