From 234eb2c19d60ad472b33fb35f81ffb6f7cf92de7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 4 Dec 2023 11:31:00 -1000
Subject: [PATCH] scx_sync: Sync scheduler changes from
 https://github.com/sched-ext/scx

---
 .../sched_ext/scx_layered/src/bpf/main.bpf.c | 20 ++++++++++++++++----
 tools/sched_ext/scx_rusty/src/bpf/main.bpf.c | 18 ++++++++++++++--
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
index cd74769e952b9..98d9418e1adf1 100644
--- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
+++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
@@ -250,10 +250,20 @@ struct layer *lookup_layer(int idx)
 	return &layers[idx];
 }
 
+/*
+ * Because the layer membership is by the default hierarchy cgroups rather than
+ * the CPU controller membership, we can't use ops.cgroup_move(). Let's iterate
+ * the tasks manually and set refresh_layer.
+ *
+ * The iteration isn't synchronized and may fail spuriously. It's not a big
+ * practical problem as process migrations are very rare in most modern systems.
+ * That said, we eventually want this to be based on CPU controller membership.
+ */
 SEC("tp_btf/cgroup_attach_task")
 int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 	     struct task_struct *leader, bool threadgroup)
 {
+	struct list_head *thread_head;
 	struct task_struct *next;
 	struct task_ctx *tctx;
 	int leader_pid = leader->pid;
@@ -265,6 +275,8 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 	if (!threadgroup)
 		return 0;
 
+	thread_head = &leader->signal->thread_head;
+
 	if (!(next = bpf_task_acquire(leader))) {
 		scx_bpf_error("failed to acquire leader");
 		return 0;
@@ -274,18 +286,18 @@
 		struct task_struct *p;
 		int pid;
 
-		p = container_of(next->thread_group.next, struct task_struct, thread_group);
+		p = container_of(next->thread_node.next, struct task_struct, thread_node);
 		bpf_task_release(next);
 
-		pid = BPF_CORE_READ(p, pid);
-		if (pid == leader_pid) {
+		if (&p->thread_node == thread_head) {
 			next = NULL;
 			break;
 		}
 
+		pid = BPF_CORE_READ(p, pid);
 		next = bpf_task_from_pid(pid);
 		if (!next) {
-			scx_bpf_error("thread iteration failed");
+			bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed");
 			break;
 		}
 
diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
index befaba957105e..c85e95bf372a4 100644
--- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
+++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
@@ -966,7 +966,13 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p,
 	pid_t pid;
 
 	pid = p->pid;
-	ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST);
+
+	/*
+	 * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may
+	 * fail spuriously due to BPF recursion protection triggering
+	 * unnecessarily.
+	 */
+	ret = bpf_map_update_elem(&task_data, &pid, &taskc, 0 /*BPF_NOEXIST*/);
 	if (ret) {
 		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);
 		return ret;
@@ -1003,7 +1009,15 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p,
 void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p)
 {
 	pid_t pid = p->pid;
-	long ret = bpf_map_delete_elem(&task_data, &pid);
+	long ret;
+
+	/*
+	 * XXX - There's no reason delete should fail here but BPF's recursion
+	 * protection can unnecessarily fail the operation. The fact that
+	 * deletions aren't reliable means that we sometimes leak task_ctx and
+	 * can't use BPF_NOEXIST on allocation in .prep_enable().
+	 */
+	ret = bpf_map_delete_elem(&task_data, &pid);
 	if (ret) {
 		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);
 		return;
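
For readers following the scx_layered hunks: the sketch below consolidates, outside of diff form, the thread-group walk the patch switches to (termination by comparing against leader->signal->thread_head and stepping via thread_node, instead of the removed thread_group list). It is illustrative only and not part of the patch; the program name walk_threadgroup, the bpf_repeat(8192) loop construct and bound, the local container_of definition, and the trailing cleanup are assumptions, while the loop body mirrors the hunks above.

/* Illustrative sketch only -- not part of the patch above. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

char _license[] SEC("license") = "GPL";

/* task kfuncs used by the patch */
struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;

#ifndef container_of
#define container_of(ptr, type, member)					\
	((type *)((void *)(ptr) - __builtin_offsetof(type, member)))
#endif

SEC("tp_btf/cgroup_attach_task")
int BPF_PROG(walk_threadgroup, struct cgroup *cgrp, const char *cgrp_path,
	     struct task_struct *leader, bool threadgroup)
{
	struct list_head *thread_head;
	struct task_struct *next;

	if (!threadgroup)
		return 0;

	/* the list head in signal_struct marks the end of the thread list */
	thread_head = &leader->signal->thread_head;

	if (!(next = bpf_task_acquire(leader)))
		return 0;

	bpf_repeat(8192) {
		struct task_struct *p;
		int pid;

		/* peek at the next thread before dropping our reference */
		p = container_of(next->thread_node.next, struct task_struct,
				 thread_node);
		bpf_task_release(next);

		/* wrapped around to the list head: all threads visited */
		if (&p->thread_node == thread_head) {
			next = NULL;
			break;
		}

		/* re-acquire by PID; may fail spuriously if the thread exits */
		pid = BPF_CORE_READ(p, pid);
		next = bpf_task_from_pid(pid);
		if (!next) {
			bpf_printk("thread iteration failed");
			break;
		}

		/* per-thread work (e.g. marking refresh_layer) goes here */
	}

	if (next)
		bpf_task_release(next);
	return 0;
}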