Fix SMP races with thread swap and abort #21903

Merged · 7 commits · Jan 21, 2020
7 changes: 7 additions & 0 deletions arch/x86/core/intel64/locore.S
@@ -220,6 +220,13 @@ z_x86_switch:
movq $X86_KERNEL_CS, _thread_offset_to_cs(%rsi)
movq $X86_KERNEL_DS, _thread_offset_to_ss(%rsi)
#endif
/* Store the handle (i.e. our thread struct address) into the
* switch handle field; this is a synchronization signal that
* must occur after the last data from the old context is
* saved.
*/
movq %rsi, ___thread_t_switch_handle_OFFSET(%rsi)

movq %gs:__x86_tss64_t_ist1_OFFSET, %rsp

/* fall through to __resume */
1 change: 0 additions & 1 deletion boards/x86/qemu_x86/qemu_x86_64_defconfig
@@ -17,4 +17,3 @@ CONFIG_SMP=y
CONFIG_MP_NUM_CPUS=2
CONFIG_X86_MMU=y
CONFIG_X86_VERY_EARLY_CONSOLE=y
CONFIG_MP_NUM_CPUS=1
5 changes: 4 additions & 1 deletion include/kernel_structs.h
@@ -59,8 +59,11 @@
/* Thread is being aborted (SMP only) */
#define _THREAD_ABORTING (BIT(5))

/* Thread was aborted in interrupt context (SMP only) */
#define _THREAD_ABORTED_IN_ISR (BIT(6))

/* Thread is present in the ready queue */
#define _THREAD_QUEUED (BIT(6))
#define _THREAD_QUEUED (BIT(7))

/* end - states */

31 changes: 25 additions & 6 deletions kernel/include/kernel_arch_interface.h
@@ -89,12 +89,31 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *pStack,
* 3) Set the stack pointer to the value provided in switch_to
* 4) Pop off all thread state from the stack we switched to and return.
*
* Some arches may implement thread->switch handle as a pointer to the thread
* itself, and save context somewhere in thread->arch. In this case, on initial
* context switch from the dummy thread, thread->switch handle for the outgoing
* thread is NULL. Instead of dereferencing switched_from all the way to get
* the thread pointer, subtract ___thread_t_switch_handle_OFFSET to obtain the
* thread pointer instead.
* Some arches may implement thread->switch_handle as a pointer to the
* thread itself, and save context somewhere in thread->arch. In this
* case, on initial context switch from the dummy thread,
* thread->switch_handle for the outgoing thread is NULL. Instead of
* dereferencing switched_from all the way to get the thread pointer,
* subtract ___thread_t_switch_handle_OFFSET to obtain the thread
* pointer instead. That is, such a scheme would have behavior like
* (in C pseudocode):
*
* void arch_switch(void *switch_to, void **switched_from)
* {
* struct k_thread *new = switch_to;
* struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread,
* switch_handle);
*
* // save old context...
* *switched_from = old;
* // restore new context...
* }
*
* Note that, regardless of the underlying handle representation, the
* incoming switched_from pointer MUST be written through with a
* non-NULL value after all relevant thread state has been saved. The
* kernel uses this as a synchronization signal to be able to wait for
* switch completion from another CPU.
*
* @param switch_to Incoming thread's switch handle
* @param switched_from Pointer to outgoing thread's switch handle storage
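To make the arch_switch() contract above concrete, here is a minimal C sketch that follows it; my_save_context() and my_restore_context() are hypothetical placeholder names for an architecture's real save/restore code, not Zephyr APIs. The locore.S hunk earlier in this diff performs the same handle store for x86-64 in assembly.

```c
#include <kernel.h>

/* Sketch only: an arch_switch() honoring the documented contract.
 * The two my_*_context() helpers are hypothetical placeholders.
 */
void arch_switch(void *switch_to, void **switched_from)
{
	struct k_thread *new = switch_to;
	struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread,
					    switch_handle);

	/* 1) Save every piece of the outgoing thread's state first. */
	my_save_context(&old->arch);

	/* 2) Only then publish the handle: from this point on, another
	 *    CPU may legally pick "old" up and run it.
	 */
	*switched_from = old;

	/* 3) Restore the incoming thread; control never returns here. */
	my_restore_context(&new->arch);
}
```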
38 changes: 38 additions & 0 deletions kernel/include/kswap.h
@@ -29,6 +29,26 @@ void z_smp_release_global_lock(struct k_thread *thread);
/* context switching and scheduling-related routines */
#ifdef CONFIG_USE_SWITCH

/* There is an unavoidable SMP race when threads swap -- their thread
* record is in the queue (and visible to other CPUs) before
* arch_switch() finishes saving state. We must spin for the switch
* handle before entering a new thread. See docs on arch_switch().
*
* Note: future SMP architectures may need a fence/barrier or cache
* invalidation here. Current ones don't, and sadly Zephyr doesn't
* have a framework for that yet.
*/
static inline void wait_for_switch(struct k_thread *thread)
{
#ifdef CONFIG_SMP
volatile void **shp = (void *)&thread->switch_handle;

while (*shp == NULL) {
k_busy_wait(1);
}
#endif
}

/* New style context switching. arch_switch() is a lower level
* primitive that doesn't know about the scheduler or return value.
* Needed for SMP, where the scheduler requires spinlocking that we
@@ -57,8 +77,25 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
k_spin_release(lock);
}

#ifdef CONFIG_SMP
/* Null out the switch handle, see wait_for_switch() above.
* Note that we set it back to a non-null value if we are not
* switching! The value itself doesn't matter, because by
* definition _current is running and has no saved state.
*/
volatile void **shp = (void *)&old_thread->switch_handle;

*shp = NULL;
#endif

new_thread = z_get_next_ready_thread();

#ifdef CONFIG_SMP
if (new_thread == old_thread) {
*shp = old_thread;
}
#endif

if (new_thread != old_thread) {
sys_trace_thread_switched_out();
#ifdef CONFIG_TIMESLICING
Expand All @@ -77,6 +114,7 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
}
#endif
_current = new_thread;
wait_for_switch(new_thread);
arch_switch(new_thread->switch_handle,
&old_thread->switch_handle);

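For the "future SMP architectures may need a fence/barrier" note above, a sketch of where the barriers would pair up on a hypothetical weakly-ordered port, using C11 atomics; neither helper exists in the tree, and the SMP targets Zephyr supports today don't need them.

```c
#include <stdatomic.h>
#include <kernel.h>

/* Outgoing side: called after the very last write of saved context. */
static inline void publish_switch_handle(struct k_thread *old)
{
	/* Release: make all prior writes (the saved registers/stack)
	 * visible to other CPUs before the handle becomes non-NULL.
	 */
	atomic_thread_fence(memory_order_release);
	old->switch_handle = old;
}

/* Incoming side: a barrier-aware variant of wait_for_switch() above. */
static inline void wait_for_switch_weak(struct k_thread *thread)
{
	volatile void **shp = (void *)&thread->switch_handle;

	while (*shp == NULL) {
		k_busy_wait(1);
	}
	/* Acquire: pairs with the release fence, so the saved context
	 * is read only after the handle was observed non-NULL.
	 */
	atomic_thread_fence(memory_order_acquire);
}
```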
2 changes: 1 addition & 1 deletion kernel/queue.c
@@ -102,8 +102,8 @@ static inline void z_vrfy_k_queue_init(struct k_queue *queue)
#if !defined(CONFIG_POLL)
static void prepare_thread_to_run(struct k_thread *thread, void *data)
{
z_ready_thread(thread);
z_thread_return_value_set_with_data(thread, 0, data);
z_ready_thread(thread);
}
#endif /* CONFIG_POLL */

50 changes: 41 additions & 9 deletions kernel/sched.c
@@ -15,6 +15,13 @@
#include <stdbool.h>
#include <kernel_internal.h>

/* Maximum time between the time a self-aborting thread flags itself
* DEAD and the last read or write to its stack memory (i.e. the time
* of its next swap()). In theory this might be tuned per platform,
* but in practice this conservative value should be safe.
*/
#define THREAD_ABORT_DELAY_US 500

#if defined(CONFIG_SCHED_DUMB)
#define _priq_run_add z_priq_dumb_add
#define _priq_run_remove z_priq_dumb_remove
@@ -436,7 +443,20 @@ void z_thread_single_abort(struct k_thread *thread)
thread->base.pended_on = NULL;
}
}
thread->base.thread_state |= _THREAD_DEAD;

u32_t mask = _THREAD_DEAD;

/* If the abort is happening in interrupt context,
* that means that execution will never return to the
* thread's stack and that the abort is known to be
* complete. Otherwise the thread still runs a bit
* until it can swap, requiring a delay.
*/
if (IS_ENABLED(CONFIG_SMP) && arch_is_in_isr()) {
Contributor: I'm not sure I understand this. z_thread_single_abort() can be called with a thread object running on another CPU, but we're checking if the current CPU is in an ISR. Do we need to check the state of the CPU that the thread is running on?

mask |= _THREAD_ABORTED_IN_ISR;
}

thread->base.thread_state |= mask;
}

sys_trace_thread_abort(thread);
@@ -616,10 +636,9 @@ void z_thread_priority_set(struct k_thread *thread, int prio)
{
bool need_sched = z_set_prio(thread, prio);

if (IS_ENABLED(CONFIG_SMP) &&
!IS_ENABLED(CONFIG_SCHED_IPI_SUPPORTED)) {
z_sched_ipi();
}
#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
arch_sched_ipi();
#endif

if (need_sched && _current->base.sched_locked == 0) {
z_reschedule_unlocked();
@@ -738,6 +757,8 @@ void *z_get_next_switch_handle(void *interrupted)
!IS_ENABLED(CONFIG_SCHED_IPI_SUPPORTED)) {
z_sched_ipi();
}

wait_for_switch(_current);
return _current->switch_handle;
}
#endif
@@ -1144,10 +1165,9 @@ void z_impl_k_wakeup(k_tid_t thread)
z_reschedule_unlocked();
}

if (IS_ENABLED(CONFIG_SMP) &&
!IS_ENABLED(CONFIG_SCHED_IPI_SUPPORTED)) {
z_sched_ipi();
}
#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_IPI_SUPPORTED)
arch_sched_ipi();
#endif
}

#ifdef CONFIG_SMP
@@ -1180,7 +1200,9 @@ void z_sched_abort(struct k_thread *thread)
* it locally. Not all architectures support that, alas. If
* we don't have it, we need to wait for some other interrupt.
*/
key = k_spin_lock(&sched_spinlock);
thread->base.thread_state |= _THREAD_ABORTING;
k_spin_unlock(&sched_spinlock, key);
#ifdef CONFIG_SCHED_IPI_SUPPORTED
arch_sched_ipi();
#endif
@@ -1204,6 +1226,16 @@
k_busy_wait(100);
}
}

/* If the thread self-aborted (e.g. its own exit raced with
* this external abort) then even though it is flagged DEAD,
* it's still running until its next swap and thus the thread
* object is still in use. We have to resort to a fallback
* delay in that circumstance.
*/
if ((thread->base.thread_state & _THREAD_ABORTED_IN_ISR) == 0U) {
k_busy_wait(THREAD_ABORT_DELAY_US);
Contributor: I feel uneasy about this... is there some state in the other thread we can poll in a loop, instead of just an arbitrary delay?

Also, I'm assuming that this logic happens with interrupts unlocked?

Contributor (author): There is not, absent changing the way swap works internally. The thread stack is in use right up to the swap call, but the swapped-to context may have been interrupted; there is nowhere at the OS level to place code to run "after swap", it just swaps back to whatever it was doing. I tried to find a way to make this work such that the scheduler lock is held and synchronously released at swap time, but even then "synchronously" isn't synchronous enough, because the CPU is still running C code at the point where that spinlock gets released.

An architectural fix would have every call to z_swap set a value in the swapped-from thread struct after the last memory access to its data had occurred; then we could spin waiting for that.

And yes, this delay happens outside the lock.

Contributor: Thinking out loud here: if the race condition is two different contexts both trying to complete k_thread_abort() on the same thread object, could we introduce a per-thread object spinlock to synchronize this?

}
}
#endif

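As a thought experiment following the review discussion on z_sched_abort() above, a sketch of the "set a value after the last memory access" idea; the swap_done field and both helpers are invented for illustration and are not part of this PR or the tree.

```c
#include <kernel.h>

/* Hypothetical only: every swap path would flag the outgoing thread
 * after its final access to that thread's stack and struct, and the
 * aborting CPU would poll the flag instead of sleeping for a fixed
 * THREAD_ABORT_DELAY_US.
 */

/* Called at the very tail of the arch-level swap, after the last
 * access to "old".
 */
static inline void mark_stack_released(struct k_thread *old)
{
	old->base.swap_done = 1;	/* invented field */
}

/* Would replace the fixed k_busy_wait(THREAD_ABORT_DELAY_US) in
 * z_sched_abort().
 */
static inline void wait_stack_released(struct k_thread *thread)
{
	while (thread->base.swap_done == 0U) {
		k_busy_wait(1);
	}
}
```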
5 changes: 5 additions & 0 deletions kernel/thread.c
@@ -525,6 +525,11 @@ void z_setup_new_thread(struct k_thread *new_thread,
arch_new_thread(new_thread, stack, stack_size, entry, p1, p2, p3,
prio, options);

#ifdef CONFIG_SMP
/* switch handle must be non-null except when inside z_swap() */
new_thread->switch_handle = new_thread;
#endif

#ifdef CONFIG_THREAD_USERSPACE_LOCAL_DATA
#ifndef CONFIG_THREAD_USERSPACE_LOCAL_DATA_ARCH_DEFER_SETUP
/* don't set again if the arch's own code in arch_new_thread() has
21 changes: 9 additions & 12 deletions samples/userspace/shared_mem/src/main.c
@@ -78,7 +78,6 @@ struct k_thread ct_thread;
K_THREAD_STACK_DEFINE(ct_stack, STACKSIZE);

_app_enc_d char encMSG[] = "ENC!\n";
_app_enc_d int enc_state = 1;
_app_enc_b char enc_pt[50]; /* Copy from shared pt */
_app_enc_b char enc_ct[50]; /* Copy to shared ct */

@@ -104,6 +103,15 @@ void main(void)
struct k_mem_partition *dom0_parts[] = {&part0, &part1};
k_tid_t tPT, tENC, tCT;

fBUFIN = 0; /* clear flags */
Contributor: Change looks good, but I'm still worried about how this was crashing by the emulator suddenly exiting. Even with a race, it should at least produce an exception that can be handled; if the whole thing triple faults, that tells me we might have a bug in arch/x86 somewhere that I should try to investigate more, unless this race somehow got the page tables corrupted. What do you think?

fBUFOUT = 0;
calc_rev_wheel((BYTE *) &W1, (BYTE *)&W1R);
calc_rev_wheel((BYTE *) &W2, (BYTE *)&W2R);
calc_rev_wheel((BYTE *) &W3, (BYTE *)&W3R);
IW1 = 0;
IW2 = 0;
IW3 = 0;

k_thread_access_grant(k_current_get(), &allforone);

/*
@@ -169,17 +177,6 @@ void enc(void)
{

int index, index_out;
if (enc_state == 1) {
fBUFIN = 0; /* clear flags */
fBUFOUT = 0;
calc_rev_wheel((BYTE *) &W1, (BYTE *)&W1R);
calc_rev_wheel((BYTE *) &W2, (BYTE *)&W2R);
calc_rev_wheel((BYTE *) &W3, (BYTE *)&W3R);
IW1 = 0;
IW2 = 0;
IW3 = 0;
enc_state = 0;
}

while (1) {
k_sem_take(&allforone, K_FOREVER);