From dad3fb67ca1cbef87ce700e83a55835e5921ce8a Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 27 Dec 2023 15:18:05 +0100
Subject: [PATCH 1/2] kernfs: convert kernfs_idr_lock to an irq safe raw spinlock

bpf_cgroup_from_id() (provided by sched-ext) needs to acquire
kernfs_idr_lock, and it can be used in the scheduler dispatch path with
rq->__lock held.

However, any kernfs function that acquires kernfs_idr_lock can be
interrupted by a scheduler tick, which would try to acquire rq->__lock,
triggering the following deadlock scenario:

        CPU0                    CPU1
        ----                    ----
   lock(kernfs_idr_lock);
                                lock(rq->__lock);
                                lock(kernfs_idr_lock);
   lock(rq->__lock);

More generally, considering that bpf_cgroup_from_id() is provided as a
kfunc, similar deadlock conditions can potentially be triggered from any
kprobe/tracepoint/fentry.

For this reason, to prevent any potential deadlock scenario, convert
kernfs_idr_lock to an irq-safe raw spinlock.

Signed-off-by: Andrea Righi
---
 fs/kernfs/dir.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8b2bd65d70e72..9ce7d2872b554 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
  */
 static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
 static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by pr_cont_lock */
-static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
+static DEFINE_RAW_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
 
 #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
 
@@ -539,6 +539,7 @@ void kernfs_put(struct kernfs_node *kn)
 {
 	struct kernfs_node *parent;
 	struct kernfs_root *root;
+	unsigned long flags;
 
 	if (!kn || !atomic_dec_and_test(&kn->count))
 		return;
@@ -563,9 +564,9 @@ void kernfs_put(struct kernfs_node *kn)
 		simple_xattrs_free(&kn->iattr->xattrs, NULL);
 		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
 	}
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, flags);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
 	kmem_cache_free(kernfs_node_cache, kn);
 
 	kn = parent;
@@ -607,6 +608,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	struct kernfs_node *kn;
 	u32 id_highbits;
 	int ret;
+	unsigned long irqflags;
 
 	name = kstrdup_const(name, GFP_KERNEL);
 	if (!name)
@@ -617,13 +619,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 		goto err_out1;
 
 	idr_preload(GFP_KERNEL);
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags);
 	ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
 	if (ret >= 0 && ret < root->last_id_lowbits)
 		root->id_highbits++;
 	id_highbits = root->id_highbits;
 	root->last_id_lowbits = ret;
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags);
 	idr_preload_end();
 	if (ret < 0)
 		goto err_out2;
@@ -659,9 +661,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	return kn;
 
  err_out3:
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags);
  err_out2:
 	kmem_cache_free(kernfs_node_cache, kn);
  err_out1:
@@ -702,8 +704,9 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 	struct kernfs_node *kn;
 	ino_t ino = kernfs_id_ino(id);
 	u32 gen = kernfs_id_gen(id);
+	unsigned long flags;
 
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, flags);
 
 	kn = idr_find(&root->ino_idr, (u32)ino);
 	if (!kn)
@@ -727,10 +730,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 	if (unlikely(!__kernfs_active(kn) ||
 		     !atomic_inc_not_zero(&kn->count)))
 		goto err_unlock;
 
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
 	return kn;
 err_unlock:
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
 	return NULL;
 }

From 6b747e0ee5fca284330d065a0c777d1991290bc4 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 27 Dec 2023 17:25:54 +0100
Subject: [PATCH 2/2] sched_ext: fix race in scx_move_task() with exiting tasks

There is a race with exiting tasks in scx_move_task() where we may fail
to check for autogroup tasks, leading to the following oops:

 WARNING: CPU: 2 PID: 100 at kernel/sched/ext.c:2571 scx_move_task+0x9f/0xb0
 ...
 Sched_ext: flatcg (enabled+all), task: runnable_at=-5ms
 RIP: 0010:scx_move_task+0x9f/0xb0
 Call Trace:
  ? scx_move_task+0x9f/0xb0
  ? __warn+0x85/0x170
  ? scx_move_task+0x9f/0xb0
  ? report_bug+0x171/0x1a0
  ? handle_bug+0x3b/0x70
  ? exc_invalid_op+0x17/0x70
  ? asm_exc_invalid_op+0x1a/0x20
  ? scx_move_task+0x9f/0xb0
  sched_move_task+0x104/0x300
  do_exit+0x37d/0xb70
  ? lock_release+0xbe/0x270
  do_group_exit+0x37/0xa0
  __x64_sys_exit_group+0x18/0x20
  do_syscall_64+0x44/0xf0
  entry_SYSCALL_64_after_hwframe+0x6f/0x77

And a related NULL pointer dereference afterwards:

 BUG: kernel NULL pointer dereference, address: 0000000000000148

Prevent this by skipping scx_move_task() actions for exiting tasks.
Moreover, make scx_move_task() more robust by only triggering the
WARN_ON_ONCE() and returning early, instead of also hitting the NULL
pointer dereference afterwards.

Signed-off-by: Andrea Righi
---
 kernel/sched/ext.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 53ee906aa2b68..634fcb7cb2431 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2560,15 +2560,22 @@ void scx_move_task(struct task_struct *p)
 	/*
 	 * We're called from sched_move_task() which handles both cgroup and
 	 * autogroup moves. Ignore the latter.
+	 *
+	 * Also ignore exiting tasks, because in the exit path tasks transition
+	 * from the autogroup to the root group, so task_group_is_autogroup()
+	 * alone isn't able to catch exiting autogroup tasks. This is safe for
+	 * cgroup_move(), because cgroup migrations never happen for PF_EXITING
+	 * tasks.
 	 */
-	if (task_group_is_autogroup(task_group(p)))
+	if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p)))
 		return;
 
 	if (!scx_enabled())
 		return;
 
 	if (SCX_HAS_OP(cgroup_move)) {
-		WARN_ON_ONCE(!p->scx.cgrp_moving_from);
+		if (WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+			return;
 		SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
 				 p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
 	}
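
Note (illustrative only, not part of the patches): the early return added
in scx_move_task() works because WARN_ON_ONCE() evaluates to the condition
it tests, so a single expression can both emit a one-time warning and guard
the bail-out. A minimal sketch of the idiom, where foo_ctx and foo_use()
are hypothetical names:

	/*
	 * Warn once on an unexpected NULL, then bail out instead of
	 * dereferencing it later on (the pattern used in scx_move_task()).
	 * foo_ctx and foo_use() are made up for illustration.
	 */
	static void foo_handle(struct foo_ctx *ctx)
	{
		if (WARN_ON_ONCE(!ctx->src))
			return;

		foo_use(ctx->src);	/* only reached when ctx->src is valid */
	}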