From 4bbb07ccaa10a13ad5ea8ff0d3fe45ffaaa831dc Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 10 Jan 2024 13:47:22 -1000
Subject: [PATCH 1/2] scx: Narrow cpus_read_lock() critical section in scx_ops_enable()

cpus_read_lock() is needed for two purposes in scx_ops_enable(). First, to
keep CPUs stable between ops.init() and enabling of ops.cpu_on/offline().
Second, to work around the locking order issue between scx_cgroup_rwsem and
cpu_hotplug_lock caused by static_branch_*().

Currently, scx_ops_enable() acquires cpus_read_lock() and holds it through
most of ops enabling, covering both use cases. This makes it difficult to
understand which lock is held where and to resolve locking order issues
among these system-wide locks.

Let's separate out the two sections so that ops.init() and
ops.cpu_on/offline() enabling are contained in their own critical section,
and cpus_read_lock() is dropped and then reacquired for the second use case.

Signed-off-by: Tejun Heo
---
 include/linux/sched/ext.h | 42 +++++++++++++++++--------------
 kernel/sched/ext.c        | 53 ++++++++++++++++++++++++---------------
 2 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index e629686cf06218..ae552129931a9c 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -412,24 +412,6 @@ struct sched_ext_ops {
 	 */
 	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
 
-	/**
-	 * cpu_online - A CPU became online
-	 * @cpu: CPU which just came up
-	 *
-	 * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
-	 * associated with other CPUs beforehand.
-	 */
-	void (*cpu_online)(s32 cpu);
-
-	/**
-	 * cpu_offline - A CPU is going offline
-	 * @cpu: CPU which is going offline
-	 *
-	 * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
-	 * associated with other CPUs afterwards.
-	 */
-	void (*cpu_offline)(s32 cpu);
-
 	/**
 	 * init_task - Initialize a task to run in a BPF scheduler
 	 * @p: task to initialize for BPF scheduling
@@ -550,7 +532,29 @@ struct sched_ext_ops {
 #endif /* CONFIG_CGROUPS */
 
 	/*
-	 * All online ops must come before ops.init().
+	 * All online ops must come before ops.cpu_online().
+	 */
+
+	/**
+	 * cpu_online - A CPU became online
+	 * @cpu: CPU which just came up
+	 *
+	 * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
+	 * associated with other CPUs beforehand.
+	 */
+	void (*cpu_online)(s32 cpu);
+
+	/**
+	 * cpu_offline - A CPU is going offline
+	 * @cpu: CPU which is going offline
+	 *
+	 * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
+	 * associated with other CPUs afterwards.
+	 */
+	void (*cpu_offline)(s32 cpu);
+
+	/*
+	 * All CPU hotplug ops must come before ops.init().
 	 */
 
 	/**
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index dd09b53254f592..f8889a82267ae6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,10 +9,15 @@
 #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
 
 enum scx_internal_consts {
-	SCX_NR_ONLINE_OPS	= SCX_OP_IDX(init),
-	SCX_DSP_DFL_MAX_BATCH	= 32,
-	SCX_DSP_MAX_LOOPS	= 32,
-	SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+	SCX_OPI_BEGIN			= 0,
+	SCX_OPI_NORMAL_BEGIN		= 0,
+	SCX_OPI_NORMAL_END		= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_BEGIN	= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_END		= SCX_OP_IDX(init),
+	SCX_OPI_END			= SCX_OP_IDX(init),
+	SCX_DSP_DFL_MAX_BATCH		= 32,
+	SCX_DSP_MAX_LOOPS		= 32,
+	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
 };
 
 enum scx_ops_enable_state {
@@ -104,8 +109,8 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
-struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] =
-	{ [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT };
+struct static_key_false scx_has_op[SCX_OPI_END] =
+	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
 
 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
 static struct scx_exit_info scx_exit_info;
@@ -3228,7 +3233,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable_cpuslocked(&__scx_ops_enabled);
-	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
+	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
 		static_branch_disable_cpuslocked(&scx_has_op[i]);
 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
@@ -3373,13 +3378,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			   scx_create_rt_helper("sched_ext_ops_helper"));
 		if (!scx_ops_helper) {
 			ret = -ENOMEM;
-			goto err_unlock;
+			goto err;
 		}
 	}
 
 	if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
 		ret = -EBUSY;
-		goto err_unlock;
+		goto err;
 	}
 
 	/*
@@ -3408,7 +3413,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		ret = SCX_CALL_OP_RET(SCX_KF_INIT, init);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
-			goto err_disable;
+			goto err_disable_unlock_cpus;
 		}
 
 		/*
@@ -3420,9 +3425,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		 * ops.exit() like other scx_bpf_error() invocations.
 		 */
 		if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE)
-			goto err_disable;
+			goto err_disable_unlock_cpus;
 	}
 
+	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+		if (((void (**)(void))ops)[i])
+			static_branch_enable_cpuslocked(&scx_has_op[i]);
+
+	cpus_read_unlock();
+
 	ret = validate_ops(ops);
 	if (ret)
 		goto err_disable;
@@ -3450,10 +3461,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 * Lock out forks, cgroup on/offlining and moves before opening the
 	 * floodgate so that they don't wander into the operations prematurely.
 	 */
+	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
 
-	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
+	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
 
@@ -3478,7 +3490,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable_unlock;
+		goto err_disable_unlock_all;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3504,7 +3516,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			spin_unlock_irq(&scx_tasks_lock);
 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 			       ret, p->comm, p->pid);
-			goto err_disable_unlock;
+			goto err_disable_unlock_all;
 		}
 
 		put_task_struct(p);
@@ -3528,7 +3540,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable_unlock;
+		goto err_disable_unlock_all;
 	}
 
 	/*
@@ -3564,6 +3576,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	preempt_enable();
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
+	cpus_read_unlock();
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
@@ -3571,24 +3584,24 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	}
 
 	if (scx_switch_all_req)
-		static_branch_enable_cpuslocked(&__scx_switched_all);
+		static_branch_enable(&__scx_switched_all);
 
-	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
 
 	return 0;
 
-err_unlock:
+err:
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable_unlock:
+err_disable_unlock_all:
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
-err_disable:
+err_disable_unlock_cpus:
 	cpus_read_unlock();
+err_disable:
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);

From 1225a9069ff2185007b078b7e447727d229a64fe Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 10 Jan 2024 13:47:22 -1000
Subject: [PATCH 2/2] scx: Reorder scx_fork_rwsem, cpu_hotplug_lock and scx_cgroup_rwsem

scx_cgroup_rwsem and scx_fork_rwsem sit in the following locking
dependency chains, respectively:

  cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
  scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock

In addition, we need to flip static_keys, which requires the CPUs to stay
stable. The only locking order which satisfies all three requirements is:

  scx_fork_rwsem --> cpu_hotplug_lock --> scx_cgroup_rwsem

Reorder locking in scx_ops_enable() and scx_ops_disable_workfn()
accordingly.

Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f8889a82267ae6..4ae373743911a1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3201,9 +3201,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	static_branch_disable(&__scx_switched_all);
 	WRITE_ONCE(scx_switching_all, false);
 
-	/* avoid racing against fork and cgroup changes */
-	cpus_read_lock();
+	/*
+	 * Avoid racing against fork and cgroup changes. See scx_ops_enable()
+	 * for explanation on the locking order.
+	 */
 	percpu_down_write(&scx_fork_rwsem);
+	cpus_read_lock();
 	scx_cgroup_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
@@ -3244,8 +3247,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	scx_cgroup_exit();
 
 	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	cpus_read_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3460,9 +3463,23 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
 	 * floodgate so that they don't wander into the operations prematurely.
+	 *
+	 * We don't need to keep the CPUs stable but static_branch_*() requires
+	 * cpus_read_lock() and scx_cgroup_rwsem must nest inside
+	 * cpu_hotplug_lock because of the following dependency chain:
+	 *
+	 *   cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
+	 *
+	 * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use
+	 * static_branch_*_cpuslocked().
+	 *
+	 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
+	 * following dependency chain:
+	 *
+	 *   scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
 	 */
-	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
+	cpus_read_lock();
 	scx_cgroup_lock();
 
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
@@ -3575,8 +3592,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
 	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	cpus_read_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
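
For reviewers, here is the lock ordering that scx_ops_enable() ends up with
once both patches are applied, condensed from the hunks above. This is an
illustrative sketch, not literal code; error paths, the enable-state
bookkeeping and the task-switching loop are omitted:

	mutex_lock(&scx_ops_enable_mutex);

	/* 1st critical section: ops.init() and the CPU hotplug ops */
	cpus_read_lock();
	/* ... call ops.init(), enable scx_has_op[SCX_OPI_CPU_HOTPLUG_*] ... */
	cpus_read_unlock();

	/* 2nd critical section: scx_fork_rwsem --> cpu_hotplug_lock --> scx_cgroup_rwsem */
	percpu_down_write(&scx_fork_rwsem);
	cpus_read_lock();
	scx_cgroup_lock();
	/* ... enable remaining static branches with static_branch_*_cpuslocked(), switch tasks ... */
	scx_cgroup_unlock();
	cpus_read_unlock();
	percpu_up_write(&scx_fork_rwsem);

	mutex_unlock(&scx_ops_enable_mutex);

scx_ops_disable_workfn() acquires the same three locks in the same order and
releases them in reverse, as the second patch shows.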