From 11e96286fc2c32b68554778bda73991b0bd9f3f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:20 -1000 Subject: [PATCH 1/8] scx: Make scx_kf_allowed_on_arg_tasks() actually use @mask The function takes @mask but ignores it and uses hard-coded __SCX_KF_RQ_LOCKED instead. Fix it. The only user was calling with __SCX_KF_RQ_LOCKED, so this doesn't cause any behavior changes. --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fc695b13efb05..afda22f52002c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1085,7 +1085,7 @@ static __always_inline bool scx_kf_allowed(u32 mask) static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(__SCX_KF_RQ_LOCKED)) + if (!scx_kf_allowed(mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && From e4af34e52c572a8a87476bfccdd3de0221caf211 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:20 -1000 Subject: [PATCH 2/8] scx: Make scx_bpf_dsq_nr_queued() safe to call from any context It's currently marked ops-only but can actually be called from any context. Let's make it safe. --- kernel/sched/ext.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index afda22f52002c..a394ab8757501 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5714,28 +5714,36 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) * @dsq_id: id of the DSQ * * Return the number of tasks in the DSQ matching @dsq_id. If not found, - * -%ENOENT is returned. Can be called from any non-sleepable online scx_ops - * operations. + * -%ENOENT is returned. */ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) { struct scx_dispatch_q *dsq; + s32 ret; - lockdep_assert(rcu_read_lock_any_held()); + preempt_disable(); if (dsq_id == SCX_DSQ_LOCAL) { - return this_rq()->scx.local_dsq.nr; + ret = this_rq()->scx.local_dsq.nr; + goto out; } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (ops_cpu_valid(cpu, NULL)) - return cpu_rq(cpu)->scx.local_dsq.nr; + if (ops_cpu_valid(cpu, NULL)) { + ret = cpu_rq(cpu)->scx.local_dsq.nr; + goto out; + } } else { dsq = find_non_local_dsq(dsq_id); - if (dsq) - return dsq->nr; + if (dsq) { + ret = dsq->nr; + goto out; + } } - return -ENOENT; + ret = -ENOENT; +out: + preempt_enable(); + return ret; } /** From 42f0ae2f942a371585f9c7b3192090e2e79e4882 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:20 -1000 Subject: [PATCH 3/8] scx: Remove scx_kfunc_set_ops_only None of the functions in scx_kfunc_set_ops_only set are ops-only. scx_bpf_select_cpu_dfl() is select_cpu-only and the rest can be called from anywhere. Remove the set and move the kfuncs to scx_kfunc_set_any and the new scx_kfunc_set_select_cpu. No functional changes intended. 
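As an illustration of the combined effect of this patch and the previous one, DSQ depth can now be queried outside the ops callbacks. The sketch below is not part of the series: the DSQ id, program name and printing are assumptions, and it relies on scx_kfunc_set_any also being registered for BPF_PROG_TYPE_TRACING, as the scx_init() hunk below suggests.

/*
 * Hedged sketch only: sample the depth of an assumed user DSQ (id 0) from a
 * tracing program instead of an scx_ops operation.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;

#define SHARED_DSQ_ID	0	/* assumed: created by the loaded scheduler */

SEC("tp_btf/sched_switch")
int BPF_PROG(sample_dsq_depth)
{
	s32 nr = scx_bpf_dsq_nr_queued(SHARED_DSQ_ID);

	if (nr >= 0)
		bpf_printk("shared DSQ depth: %d", nr);
	/* nr is -ENOENT if no such DSQ exists, e.g. scheduler not loaded */
	return 0;
}

char _license[] SEC("license") = "GPL";
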
--- kernel/sched/ext.c | 258 +++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 127 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a394ab8757501..525c04ef0754a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5354,6 +5354,44 @@ static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { .set = &scx_kfunc_ids_sleepable, }; +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { + *is_idle = false; + return prev_cpu; + } + + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -5746,91 +5784,6 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) return ret; } -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return false; - } - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
- */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return -EBUSY; - } - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - /** * scx_bpf_destroy_dsq - Destroy a custom DSQ * @dsq_id: DSQ to destroy @@ -5845,48 +5798,8 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) destroy_dsq(dsq_id); } -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; - } - - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -} - __bpf_kfunc_end_defs(); -BTF_KFUNCS_START(scx_kfunc_ids_ops_only) -BTF_ID_FLAGS(func, scx_bpf_kick_cpu) -BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_ops_only) - -static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_ops_only, -}; - struct scx_bpf_error_bstr_bufs { u64 data[MAX_BPRINTF_VARARGS]; char msg[SCX_EXIT_MSG_LEN]; @@ -6148,6 +6061,91 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ } +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
+ */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return false; + } + + if (ops_cpu_valid(cpu, NULL)) + return test_and_clear_cpu_idle(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return -EBUSY; + } + + return scx_pick_idle_cpu(cpus_allowed, flags); +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + /** * scx_bpf_task_running - Is task currently running? 
* @p: task of interest @@ -6204,6 +6202,9 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) @@ -6216,6 +6217,9 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) #ifdef CONFIG_CGROUP_SCHED @@ -6245,14 +6249,14 @@ static int __init scx_init(void) */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_sleepable)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_select_cpu)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cpu_release)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_ops_only)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_any)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, From 3971557011abd12011e41b3e1001f6857bff100f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:20 -1000 Subject: [PATCH 4/8] scx: Rename scx_dispatch_q node field names We'll use the list_head for both fifo and priq. Let's rename it to list. No functional changes intended. --- include/linux/sched/ext.h | 12 +++++++----- init/init_task.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/ext.c | 26 +++++++++++++------------- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index dd113d4a95ada..fae765203cb93 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -51,13 +51,15 @@ enum scx_dsq_id_flags { }; /* - * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the - * scheduler core and the BPF scheduler. See the documentation for more details. + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details. 
*/ struct scx_dispatch_q { raw_spinlock_t lock; - struct list_head fifo; /* processed in dispatching order */ - struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ + struct list_head list; /* tasks in dispatch order */ + struct rb_root_cached priq; /* used to order by p->scx.dsq_vtime */ u32 nr; u64 id; struct rhash_head hash_node; @@ -126,7 +128,7 @@ enum scx_kf_mask { struct sched_ext_entity { struct scx_dispatch_q *dsq; struct { - struct list_head fifo; /* dispatch order */ + struct list_head list; /* dispatch order */ struct rb_node priq; /* p->scx.dsq_vtime order */ } dsq_node; u32 flags; /* protected by rq lock */ diff --git a/init/init_task.c b/init/init_task.c index a32c606cc81ba..5441363f98a7c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -101,7 +101,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_SCHED_CLASS_EXT .scx = { - .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), + .dsq_node.list = LIST_HEAD_INIT(init_task.scx.dsq_node.list), .flags = 0, .sticky_cpu = -1, .holding_cpu = -1, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae7039676d313..19420fa4750e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4586,7 +4586,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHED_CLASS_EXT p->scx.dsq = NULL; - INIT_LIST_HEAD(&p->scx.dsq_node.fifo); + INIT_LIST_HEAD(&p->scx.dsq_node.list); RB_CLEAR_NODE(&p->scx.dsq_node.priq); p->scx.flags = 0; p->scx.weight = 0; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 525c04ef0754a..a2fe422b4ab4f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1384,7 +1384,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, { bool is_local = dsq->id == SCX_DSQ_LOCAL; - WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); @@ -1417,14 +1417,14 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ - if (unlikely(!list_empty(&dsq->fifo))) + if (unlikely(!list_empty(&dsq->list))) scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", dsq->id); } else { if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) - list_add(&p->scx.dsq_node.fifo, &dsq->fifo); + list_add(&p->scx.dsq_node.list, &dsq->list); else - list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); + list_add_tail(&p->scx.dsq_node.list, &dsq->list); /* A DSQ should only be using either FIFO or PRIQ enqueuing. 
*/ if (unlikely(rb_first_cached(&dsq->priq))) scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", @@ -1475,13 +1475,13 @@ static void task_unlink_from_dsq(struct task_struct *p, RB_CLEAR_NODE(&p->scx.dsq_node.priq); p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; } else { - list_del_init(&p->scx.dsq_node.fifo); + list_del_init(&p->scx.dsq_node.list); } } static bool task_linked_on_dsq(struct task_struct *p) { - return !list_empty(&p->scx.dsq_node.fifo) || + return !list_empty(&p->scx.dsq_node.list) || !RB_EMPTY_NODE(&p->scx.dsq_node.priq); } @@ -2028,12 +2028,12 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, struct rq *task_rq; bool moved = false; retry: - if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq)) + if (list_empty(&dsq->list) && !rb_first_cached(&dsq->priq)) return false; raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) { + list_for_each_entry(p, &dsq->list, scx.dsq_node.list) { task_rq = task_rq(p); if (rq == task_rq) goto this_rq; @@ -2058,7 +2058,7 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, /* @dsq is locked and @p is on this rq */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); task_unlink_from_dsq(p, dsq); - list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo); + list_add_tail(&p->scx.dsq_node.list, &scx_rq->local_dsq.list); dsq->nr--; scx_rq->local_dsq.nr++; p->scx.dsq = &scx_rq->local_dsq; @@ -2584,7 +2584,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) * can find the task unless it wants to trigger a separate * follow-up scheduling event. */ - if (list_empty(&rq->scx.local_dsq.fifo)) + if (list_empty(&rq->scx.local_dsq.list)) do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); else do_enqueue_task(rq, p, 0, -1); @@ -2594,8 +2594,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq)); - return list_first_entry_or_null(&rq->scx.local_dsq.fifo, - struct task_struct, scx.dsq_node.fifo); + return list_first_entry_or_null(&rq->scx.local_dsq.list, + struct task_struct, scx.dsq_node.list); } static struct task_struct *pick_next_task_scx(struct rq *rq) @@ -3669,7 +3669,7 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); - INIT_LIST_HEAD(&dsq->fifo); + INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; } From 87a3eb71394d653330bd4d4899690c0452c27f0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:21 -1000 Subject: [PATCH 5/8] scx: Use uncached rbtree for dsq priq priq will updated so that the tasks are added to the list in vtime order too. Thus, caching the first node won't be necessary. Let's switch to uncached variant. 
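scx_dsq_priq_less() itself doesn't appear in the hunks below; for reference, here is a minimal standalone sketch of the rb_add() + less() pattern the uncached variant uses. The node type, names and the wraparound-safe comparison are illustrative assumptions, not the actual scx code.

#include <linux/rbtree.h>
#include <linux/types.h>

/* simplified stand-in for a vtime-ordered priq entry */
struct vtime_node {
	struct rb_node	rb;
	u64		vtime;
};

static bool vtime_less(struct rb_node *a, const struct rb_node *b)
{
	const struct vtime_node *na = rb_entry(a, struct vtime_node, rb);
	const struct vtime_node *nb = rb_entry(b, struct vtime_node, rb);

	/* signed difference handles u64 vtime wraparound */
	return (s64)(na->vtime - nb->vtime) < 0;
}

static void vtime_insert(struct rb_root *root, struct vtime_node *node)
{
	/* plain rb_add(): no leftmost-node cache to maintain */
	rb_add(&node->rb, root, vtime_less);
}

Without the cache, rb_first() walks down the left spine on each call, which is acceptable here because the next patch makes dsq->list carry dispatch order and stops consulting the rbtree on the consume path.
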
--- include/linux/sched/ext.h | 2 +- kernel/sched/ext.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index fae765203cb93..29cb680703916 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -59,7 +59,7 @@ enum scx_dsq_id_flags { struct scx_dispatch_q { raw_spinlock_t lock; struct list_head list; /* tasks in dispatch order */ - struct rb_root_cached priq; /* used to order by p->scx.dsq_vtime */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; u64 id; struct rhash_head hash_node; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a2fe422b4ab4f..18f17b6485a2b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1414,8 +1414,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (enq_flags & SCX_ENQ_DSQ_PRIQ) { p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; - rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, - scx_dsq_priq_less); + rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ if (unlikely(!list_empty(&dsq->list))) scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", @@ -1426,7 +1425,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, else list_add_tail(&p->scx.dsq_node.list, &dsq->list); /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ - if (unlikely(rb_first_cached(&dsq->priq))) + if (unlikely(rb_first(&dsq->priq))) scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", dsq->id); } @@ -1471,7 +1470,7 @@ static void task_unlink_from_dsq(struct task_struct *p, struct scx_dispatch_q *dsq) { if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { - rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); + rb_erase(&p->scx.dsq_node.priq, &dsq->priq); RB_CLEAR_NODE(&p->scx.dsq_node.priq); p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; } else { @@ -2028,7 +2027,7 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, struct rq *task_rq; bool moved = false; retry: - if (list_empty(&dsq->list) && !rb_first_cached(&dsq->priq)) + if (list_empty(&dsq->list) && !rb_first(&dsq->priq)) return false; raw_spin_lock(&dsq->lock); @@ -2041,7 +2040,7 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, goto remote_rq; } - for (rb_node = rb_first_cached(&dsq->priq); rb_node; + for (rb_node = rb_first(&dsq->priq); rb_node; rb_node = rb_next(rb_node)) { p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); task_rq = task_rq(p); @@ -2593,7 +2592,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { - WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq)); + WARN_ON_ONCE(rb_first(&rq->scx.local_dsq.priq)); return list_first_entry_or_null(&rq->scx.local_dsq.list, struct task_struct, scx.dsq_node.list); } From 2bbcff2b49b3e8bebe5542a82bf272cded0ab0aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:21 -1000 Subject: [PATCH 6/8] scx: Always keep tasks on dsq->list When a DSQ is used as a PRIQ, tasks were only kept on dsq->priq. This requires all other paths to check both dsq->priq and dsq->list. A task's scx.dsq_node.list is available when the task is on a PRIQ DSQ anyway, let's always put tasks on dsq->list whether it's used as a PRIQ or FIFO by inserting into dsq->list in vtime order on PRIQs. 
This means that dsq->list is always in dispatch order whether the DSQ is FIFO or PRIQ and the only paths that need to worry about the DSQ type are dispatch_enqueue() and task_unlink_from_dsq(). --- kernel/sched/ext.c | 60 +++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 18f17b6485a2b..8857f54a5cee0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1413,21 +1413,44 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } if (enq_flags & SCX_ENQ_DSQ_PRIQ) { - p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; - rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); - /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ - if (unlikely(!list_empty(&dsq->list))) + struct rb_node *rbp; + + /* + * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are + * linked to both the rbtree and list on PRIQs, this can only be + * tested easily when adding the first task. + */ + if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && + !list_empty(&dsq->list))) scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", dsq->id); + + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); + + /* + * Find the previous task and insert after it on the list so + * that @dsq->list is vtime ordered. + */ + rbp = rb_prev(&p->scx.dsq_node.priq); + if (rbp) { + struct task_struct *prev = + container_of(rbp, struct task_struct, + scx.dsq_node.priq); + list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list); + } else { + list_add(&p->scx.dsq_node.list, &dsq->list); + } } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ + if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_node.list, &dsq->list); else list_add_tail(&p->scx.dsq_node.list, &dsq->list); - /* A DSQ should only be using either FIFO or PRIQ enqueuing. 
*/ - if (unlikely(rb_first(&dsq->priq))) - scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", - dsq->id); } dsq->nr++; p->scx.dsq = dsq; @@ -1473,15 +1496,14 @@ static void task_unlink_from_dsq(struct task_struct *p, rb_erase(&p->scx.dsq_node.priq, &dsq->priq); RB_CLEAR_NODE(&p->scx.dsq_node.priq); p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; - } else { - list_del_init(&p->scx.dsq_node.list); } + + list_del_init(&p->scx.dsq_node.list); } static bool task_linked_on_dsq(struct task_struct *p) { - return !list_empty(&p->scx.dsq_node.list) || - !RB_EMPTY_NODE(&p->scx.dsq_node.priq); + return !list_empty(&p->scx.dsq_node.list); } static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) @@ -2023,11 +2045,10 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, { struct scx_rq *scx_rq = &rq->scx; struct task_struct *p; - struct rb_node *rb_node; struct rq *task_rq; bool moved = false; retry: - if (list_empty(&dsq->list) && !rb_first(&dsq->priq)) + if (list_empty(&dsq->list)) return false; raw_spin_lock(&dsq->lock); @@ -2040,16 +2061,6 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, goto remote_rq; } - for (rb_node = rb_first(&dsq->priq); rb_node; - rb_node = rb_next(rb_node)) { - p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); - task_rq = task_rq(p); - if (rq == task_rq) - goto this_rq; - if (task_can_run_on_rq(p, rq)) - goto remote_rq; - } - raw_spin_unlock(&dsq->lock); return false; @@ -2592,7 +2603,6 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { - WARN_ON_ONCE(rb_first(&rq->scx.local_dsq.priq)); return list_first_entry_or_null(&rq->scx.local_dsq.list, struct task_struct, scx.dsq_node.list); } From ca1cf720ce791f7f119422988e43abe996cbdc3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:21 -1000 Subject: [PATCH 7/8] scx: Cosmetic / trivial changes - Relocate scx_task_iter definition so that it's together with iter functions. - Add find_user_dsq() instead of open-coding rhashtable_lookup_fast(). No functional changes. --- kernel/sched/ext.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857f54a5cee0..cb801fa7f4313 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -929,13 +929,6 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, #define scx_ops_error(fmt, args...) 
\ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) -struct scx_task_iter { - struct sched_ext_entity cursor; - struct task_struct *locked; - struct rq *rq; - struct rq_flags rf; -}; - #define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) static long jiffies_delta_msecs(unsigned long at, unsigned long now) @@ -1097,6 +1090,13 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, return true; } +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; +}; + /** * scx_task_iter_init - Initialize a task iterator * @iter: iterator to init @@ -1552,6 +1552,11 @@ static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) { lockdep_assert(rcu_read_lock_any_held()); @@ -1559,8 +1564,7 @@ static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) if (dsq_id == SCX_DSQ_GLOBAL) return &scx_dsq_global; else - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, - dsq_hash_params); + return find_user_dsq(dsq_id); } static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, @@ -3723,7 +3727,7 @@ static void destroy_dsq(u64 dsq_id) rcu_read_lock(); - dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + dsq = find_user_dsq(dsq_id); if (!dsq) goto out_unlock_rcu; From 9a13ab9323bc8b49b6de3a5622fc633a4c839d81 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Apr 2024 15:46:21 -1000 Subject: [PATCH 8/8] scx: Move p->scx.dsq_flags into p->scx.dsq_node and give p->scx.dsq_node's type the name of scx_dsq_node. This will be used to implement DSQ iterator. This wastes 8 bytes on 64bit. Oh well. --- include/linux/sched/ext.h | 12 +++++++----- kernel/sched/ext.c | 10 +++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 29cb680703916..793efa68fe1de 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -121,18 +121,20 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; +struct scx_dsq_node { + struct list_head list; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + u32 flags; /* SCX_TASK_DSQ_* flags */ +}; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. 
*/ struct sched_ext_entity { struct scx_dispatch_q *dsq; - struct { - struct list_head list; /* dispatch order */ - struct rb_node priq; /* p->scx.dsq_vtime order */ - } dsq_node; + struct scx_dsq_node dsq_node; /* protected by dsq lock */ u32 flags; /* protected by rq lock */ - u32 dsq_flags; /* protected by dsq lock */ u32 weight; s32 sticky_cpu; s32 holding_cpu; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cb801fa7f4313..46fd51b7a602e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1385,7 +1385,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, bool is_local = dsq->id == SCX_DSQ_LOCAL; WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); - WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || + WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) || !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { @@ -1425,7 +1425,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", dsq->id); - p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); /* @@ -1492,10 +1492,10 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, static void task_unlink_from_dsq(struct task_struct *p, struct scx_dispatch_q *dsq) { - if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) { rb_erase(&p->scx.dsq_node.priq, &dsq->priq); RB_CLEAR_NODE(&p->scx.dsq_node.priq); - p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; + p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ; } list_del_init(&p->scx.dsq_node.list); @@ -4318,7 +4318,7 @@ static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker, seq_buf_printf(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu\n", scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, - p->scx.dsq_flags, + p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); seq_buf_printf(s, " sticky/holding_cpu=%d/%d dsq_id=%s\n",