scx: fix NULL pointer dereference with scx_exit_info

Trying to load multiple schedulers at the same time can trigger the following NULL pointer dereference: [22053.942960] BUG: kernel NULL pointer dereference, address: 0000000000000000 [22053.942966] #PF: supervisor write access in kernel mode [22053.942968] #PF: error_code(0x0002) - not-present page [22053.942969] PGD 0 P4D 0 [22053.942972] Oops: 0002 [#1] PREEMPT SMP NOPTI [22053.942976] CPU: 6 PID: 3550 Comm: sched_ext_ops_h Tainted: G O 6.7.0-4-generic #4+scx3-Ubuntu [22053.942978] Hardware name: GPD G1621-02/G1621-02, BIOS 2.04 09/01/2022 [22053.942980] Sched_ext: central (enabled+all) [22053.942981] RIP: 0010:scx_ops_disable_workfn+0x85/0x610 [22053.942987] Code: 89 df f3 48 ab b9 01 00 00 00 83 fa 01 0f 86 c3 00 00 00 89 d0 f0 0f b1 0d 60 ff 11 02 0f 85 10 05 00 00 48 8b 85 90 fe ff ff <89> 10 81 fa 00 04 00 00 0f 84 ef 04 00 00 0f 8e cb 00 00 00 48 c7 [22053.942989] RSP: 0018:ffffad420257fd30 EFLAGS: 00010246 [22053.942991] RAX: 0000000000000000 RBX: ffffad420257fd58 RCX: 0000000000000001 [22053.942993] RDX: 0000000000000040 RSI: 0000000000000000 RDI: ffffad420257fd98 [22053.942994] RBP: ffffad420257fea8 R08: 0000000000000000 R09: 0000000000000000 [22053.942996] R10: 0000000000000000 R11: 0000000000000000 R12: ffffad420257fd98 [22053.942998] R13: ffff96a4d07eddc0 R14: ffff96a4d07eddc4 R15: ffffffff8897c3b0 [22053.942999] FS: 0000000000000000(0000) GS:ffff96a7dfb00000(0000) knlGS:0000000000000000 [22053.943002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [22053.943003] CR2: 0000000000000000 CR3: 00000001f6a3c001 CR4: 0000000000f70ef0 [22053.943005] PKRU: 55555554 [22053.943006] Call Trace: [22053.943008] <TASK> [22053.943012] ? show_regs+0x6d/0x80 [22053.943016] ? __die+0x24/0x80 [22053.943018] ? page_fault_oops+0x99/0x1b0 [22053.943021] ? do_user_addr_fault+0x2ee/0x6b0 [22053.943024] ? exc_page_fault+0x83/0x1b0 [22053.943028] ? asm_exc_page_fault+0x27/0x30 [22053.943030] ? __pfx_scx_ops_disable_workfn+0x10/0x10 [22053.943034] ? scx_ops_disable_workfn+0x85/0x610 [22053.943036] ? asm_sysvec_irq_work+0x1b/0x20 [22053.943042] ? __pfx_scx_ops_disable_workfn+0x10/0x10 [22053.943043] kthread_worker_fn+0x9e/0x230 [22053.943048] ? __pfx_kthread_worker_fn+0x10/0x10 [22053.943050] kthread+0xef/0x120 [22053.943053] ? __pfx_kthread+0x10/0x10 [22053.943056] ret_from_fork+0x44/0x70 [22053.943058] ? __pfx_kthread+0x10/0x10 [22053.943061] ret_from_fork_asm+0x1b/0x30 [22053.943064] </TASK> This happens because in scx_ops_enable(), if a scheduler is already running, we are freeing scx_exit_info, that is still owned by the running scheduler. Therefore, as soon as we stop the running scheduler we can hit the NULL pointer dereference. Reproducer: - start any scheduler - try to start another scheduler - stop the running scheduler - BUG Fix this by not freeing scx_exit_info in error path of scx_ops_enable() when there is a running scheduler. Moreover, also make sure to access scx_exit_info with the scx_ops_enable_mutex held in scx_ops_disable_workfn(), to prevent other potential race conditions. Fixes: 26ae1b0 ("scx: Make scx_exit_info fields dynamically allocated") Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
sched-ext · Jan 26, 2024 · 57d57d8 · 57d57d8
1 parent 58ac752
commit 57d57d8
Showing 1 changed file with 20 additions and 13 deletions.
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
@@ -3272,7 +3272,7 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 
 static void scx_ops_disable_workfn(struct kthread_work *work)
 {
-	struct scx_exit_info *ei = scx_exit_info;
+	struct scx_exit_info *ei;
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	struct rhashtable_iter rht_iter;
@@ -3291,14 +3291,27 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
 			break;
 	}
+
+	/* guarantee forward progress by bypassing scx_ops */
+	scx_ops_bypass(true);
+
+	/*
+	 * Here, every runnable task is guaranteed to make forward progress and
+	 * we can safely use blocking synchronization constructs. Actually
+	 * disable ops.
+	 */
+	mutex_lock(&scx_ops_enable_mutex);
+
+	ei = scx_exit_info;
+	if (!ei) {
+		mutex_unlock(&scx_ops_enable_mutex);
+		goto done;
+	}
 	ei->kind = kind;
 	ei->reason = scx_exit_reason(ei->kind);
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
 
-	/* guarantee forward progress by bypassing scx_ops */
-	scx_ops_bypass(true);
-
 	switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
 	case SCX_OPS_DISABLING:
 		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
@@ -3313,13 +3326,6 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		break;
 	}
 
-	/*
-	 * Here, every runnable task is guaranteed to make forward progress and
-	 * we can safely use blocking synchronization constructs. Actually
-	 * disable ops.
-	 */
-	mutex_lock(&scx_ops_enable_mutex);
-
 	static_branch_disable(&__scx_switched_all);
 	WRITE_ONCE(scx_switching_all, false);
 
@@ -3614,13 +3620,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			   scx_create_rt_helper("sched_ext_ops_helper"));
 		if (!scx_ops_helper) {
 			ret = -ENOMEM;
-			goto err;
+			goto err_unlock;
 		}
 	}
 
 	if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
 		ret = -EBUSY;
-		goto err;
+		goto err_unlock;
 	}
 
 	scx_exit_info = alloc_exit_info();
@@ -3868,6 +3874,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		free_exit_info(scx_exit_info);
 		scx_exit_info = NULL;
 	}
+err_unlock:
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;