From dad3fb67ca1cbef87ce700e83a55835e5921ce8a Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 27 Dec 2023 15:18:05 +0100
Subject: [PATCH 1/2] kernfs: convert kernfs_idr_lock to an irq safe raw spinlock

bpf_cgroup_from_id() (provided by sched-ext) needs to acquire
kernfs_idr_lock, and it can be used in the scheduler dispatch path with
rq->__lock held.

However, any kernfs function that acquires kernfs_idr_lock can be
interrupted by a scheduler tick, which would try to acquire rq->__lock,
triggering the following deadlock scenario:

        CPU0                    CPU1
        ----                    ----
   lock(kernfs_idr_lock);
                                lock(rq->__lock);
                                lock(kernfs_idr_lock);
   lock(rq->__lock);

More generally, considering that bpf_cgroup_from_id() is provided as a
kfunc, similar deadlock conditions can potentially be triggered from any
kprobe/tracepoint/fentry.

For this reason, to prevent any potential deadlock scenario, convert
kernfs_idr_lock to an irq-safe raw spinlock.

Signed-off-by: Andrea Righi
---
 fs/kernfs/dir.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8b2bd65d70e72..9ce7d2872b554 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
  */
 static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
 static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by pr_cont_lock */
-static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
+static DEFINE_RAW_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
 
 #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
 
@@ -539,6 +539,7 @@ void kernfs_put(struct kernfs_node *kn)
 {
 	struct kernfs_node *parent;
 	struct kernfs_root *root;
+	unsigned long flags;
 
 	if (!kn || !atomic_dec_and_test(&kn->count))
 		return;
@@ -563,9 +564,9 @@ void kernfs_put(struct kernfs_node *kn)
 		simple_xattrs_free(&kn->iattr->xattrs, NULL);
 		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
 	}
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, flags);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
 	kmem_cache_free(kernfs_node_cache, kn);
 
 	kn = parent;
@@ -607,6 +608,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	struct kernfs_node *kn;
 	u32 id_highbits;
 	int ret;
+	unsigned long irqflags;
 
 	name = kstrdup_const(name, GFP_KERNEL);
 	if (!name)
@@ -617,13 +619,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 		goto err_out1;
 
 	idr_preload(GFP_KERNEL);
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags);
 	ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
 	if (ret >= 0 && ret < root->last_id_lowbits)
 		root->id_highbits++;
 	id_highbits = root->id_highbits;
 	root->last_id_lowbits = ret;
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags);
 	idr_preload_end();
 	if (ret < 0)
 		goto err_out2;
@@ -659,9 +661,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	return kn;
 
  err_out3:
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags);
  err_out2:
 	kmem_cache_free(kernfs_node_cache, kn);
  err_out1:
@@ -702,8 +704,9 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 	struct kernfs_node *kn;
 	ino_t ino = kernfs_id_ino(id);
 	u32 gen = kernfs_id_gen(id);
+	unsigned long flags;
 
-	spin_lock(&kernfs_idr_lock);
+	raw_spin_lock_irqsave(&kernfs_idr_lock, flags);
 
 	kn = idr_find(&root->ino_idr, (u32)ino);
 	if (!kn)
@@ -727,10 +730,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 	if (unlikely(!__kernfs_active(kn) ||
 		     !atomic_inc_not_zero(&kn->count)))
 		goto err_unlock;
 
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
 	return kn;
 err_unlock:
-	spin_unlock(&kernfs_idr_lock);
+	raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
 	return NULL;
 }

From 6b747e0ee5fca284330d065a0c777d1991290bc4 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Wed, 27 Dec 2023 17:25:54 +0100
Subject: [PATCH 2/2] sched_ext: fix race in scx_move_task() with exiting tasks

There is a race with exiting tasks in scx_move_task() where we may fail
to check for autogroup tasks, leading to the following oops:

 WARNING: CPU: 2 PID: 100 at kernel/sched/ext.c:2571 scx_move_task+0x9f/0xb0
 ...
 Sched_ext: flatcg (enabled+all), task: runnable_at=-5ms
 RIP: 0010:scx_move_task+0x9f/0xb0
 Call Trace:
  ? scx_move_task+0x9f/0xb0
  ? __warn+0x85/0x170
  ? scx_move_task+0x9f/0xb0
  ? report_bug+0x171/0x1a0
  ? handle_bug+0x3b/0x70
  ? exc_invalid_op+0x17/0x70
  ? asm_exc_invalid_op+0x1a/0x20
  ? scx_move_task+0x9f/0xb0
  sched_move_task+0x104/0x300
  do_exit+0x37d/0xb70
  ? lock_release+0xbe/0x270
  do_group_exit+0x37/0xa0
  __x64_sys_exit_group+0x18/0x20
  do_syscall_64+0x44/0xf0
  entry_SYSCALL_64_after_hwframe+0x6f/0x77

And a related NULL pointer dereference afterwards:

 BUG: kernel NULL pointer dereference, address: 0000000000000148

Prevent this by skipping scx_move_task() actions for exiting tasks.
Moreover, make scx_move_task() more robust by only triggering the
WARN_ON_ONCE() and returning early, instead of also hitting the NULL
pointer dereference afterwards.

Signed-off-by: Andrea Righi
---
 kernel/sched/ext.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 53ee906aa2b68..634fcb7cb2431 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2560,15 +2560,22 @@ void scx_move_task(struct task_struct *p)
 	/*
 	 * We're called from sched_move_task() which handles both cgroup and
 	 * autogroup moves. Ignore the latter.
+	 *
+	 * Also ignore exiting tasks, because in the exit path tasks transition
+	 * from the autogroup to the root group, so task_group_is_autogroup()
+	 * alone isn't able to catch exiting autogroup tasks. This is safe for
+	 * cgroup_move(), because cgroup migrations never happen for PF_EXITING
+	 * tasks.
 	 */
-	if (task_group_is_autogroup(task_group(p)))
+	if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p)))
 		return;
 
 	if (!scx_enabled())
 		return;
 
 	if (SCX_HAS_OP(cgroup_move)) {
-		WARN_ON_ONCE(!p->scx.cgrp_moving_from);
+		if (WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+			return;
 		SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
 				 p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
 	}
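
Note (illustrative only, not part of the patches): the early return added
in scx_move_task() works because WARN_ON_ONCE() evaluates to the condition
it tests, so a single expression can both emit a one-time warning and guard
the bail-out. A minimal sketch of the idiom, where foo_ctx and foo_use()
are hypothetical names:

	/*
	 * Warn once on an unexpected NULL, then bail out instead of
	 * dereferencing it later on (the pattern used in scx_move_task()).
	 * foo_ctx and foo_use() are made up for illustration.
	 */
	static void foo_handle(struct foo_ctx *ctx)
	{
		if (WARN_ON_ONCE(!ctx->src))
			return;

		foo_use(ctx->src);	/* only reached when ctx->src is valid */
	}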