From 234eb2c19d60ad472b33fb35f81ffb6f7cf92de7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 4 Dec 2023 11:31:00 -1000
Subject: [PATCH] scx_sync: Sync scheduler changes from
 https://github.com/sched-ext/scx

---
 .../sched_ext/scx_layered/src/bpf/main.bpf.c  | 20 +++++++++++++++----
 tools/sched_ext/scx_rusty/src/bpf/main.bpf.c  | 18 +++++++++++++++--
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
index cd74769e952b9..98d9418e1adf1 100644
--- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
+++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
@@ -250,10 +250,20 @@ struct layer *lookup_layer(int idx)
 	return &layers[idx];
 }
 
+/*
+ * Because layer membership is determined by the default hierarchy cgroups
+ * rather than by CPU controller membership, we can't use ops.cgroup_move().
+ * Iterate the tasks manually and set refresh_layer instead.
+ *
+ * The iteration isn't synchronized and may fail spuriously. In practice that's
+ * not a big problem as moving processes between cgroups is rare on most systems.
+ * That said, we eventually want this to be based on CPU controller membership.
+ */
 SEC("tp_btf/cgroup_attach_task")
 int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 	     struct task_struct *leader, bool threadgroup)
 {
+	struct list_head *thread_head;
 	struct task_struct *next;
 	struct task_ctx *tctx;
 	int leader_pid = leader->pid;
@@ -265,6 +275,8 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 	if (!threadgroup)
 		return 0;
 
+	thread_head = &leader->signal->thread_head;
+
 	if (!(next = bpf_task_acquire(leader))) {
 		scx_bpf_error("failed to acquire leader");
 		return 0;
@@ -274,18 +286,18 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 		struct task_struct *p;
 		int pid;
 
-		p = container_of(next->thread_group.next, struct task_struct, thread_group);
+		p = container_of(next->thread_node.next, struct task_struct, thread_node);
 		bpf_task_release(next);
 
-		pid = BPF_CORE_READ(p, pid);
-		if (pid == leader_pid) {
+		if (&p->thread_node == thread_head) {
 			next = NULL;
 			break;
 		}
 
+		pid = BPF_CORE_READ(p, pid);
 		next = bpf_task_from_pid(pid);
 		if (!next) {
-			scx_bpf_error("thread iteration failed");
+			bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed");
 			break;
 		}
 
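
For reference, here is roughly how the thread-group walk in scx_layered's
tp_cgroup_attach_task() reads with the hunks above applied. This is a sketch
assembled from the diff; the bounded-loop wrapper (shown as
bpf_repeat(MAX_TASKS)), the per-thread refresh_layer update and the trailing
reference release are assumptions standing in for surrounding code that the
hunks don't show.

	struct list_head *thread_head = &leader->signal->thread_head;
	struct task_struct *next;

	if (!(next = bpf_task_acquire(leader))) {
		scx_bpf_error("failed to acquire leader");
		return 0;
	}

	bpf_repeat(MAX_TASKS) {		/* assumed bound on the walk */
		struct task_struct *p;
		int pid;

		/* step to the next thread on the unlocked signal->thread_head list */
		p = container_of(next->thread_node.next, struct task_struct,
				 thread_node);
		bpf_task_release(next);

		/* wrapped back around to the list head - every thread was visited */
		if (&p->thread_node == thread_head) {
			next = NULL;
			break;
		}

		/* re-acquire through the PID; the unsynchronized walk may fail */
		pid = BPF_CORE_READ(p, pid);
		next = bpf_task_from_pid(pid);
		if (!next) {
			bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed");
			break;
		}

		/* ... look up next's task_ctx and set refresh_layer ... (assumed) */
	}

	if (next)
		bpf_task_release(next);

Compared to the removed version, the walk now follows signal->thread_head via
thread_node instead of the task_struct thread_group list and terminates when it
wraps back to the list head rather than when it sees the leader's PID again,
and a spurious iteration failure is reported with bpf_printk() instead of
aborting the scheduler through scx_bpf_error().
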
diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
index befaba957105e..c85e95bf372a4 100644
--- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
+++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
@@ -966,7 +966,13 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p,
 	pid_t pid;
 
 	pid = p->pid;
-	ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST);
+
+	/*
+	 * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may
+	 * fail spuriously due to BPF recursion protection triggering unnecessarily,
+	 * which can leave a stale entry behind and fail a BPF_NOEXIST update.
+	 */
+	ret = bpf_map_update_elem(&task_data, &pid, &taskc, 0 /*BPF_NOEXIST*/);
 	if (ret) {
 		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);
 		return ret;
@@ -1003,7 +1009,15 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p,
 void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p)
 {
 	pid_t pid = p->pid;
-	long ret = bpf_map_delete_elem(&task_data, &pid);
+	long ret;
+
+	/*
+	 * XXX - There's no reason the delete should fail here, but BPF's recursion
+	 * protection can spuriously fail the operation. Because deletions aren't
+	 * reliable, we sometimes leak a task_ctx and can't use BPF_NOEXIST on
+	 * allocation in .prep_enable().
+	 */
+	ret = bpf_map_delete_elem(&task_data, &pid);
 	if (ret) {
 		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);
 		return;
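
The two scx_rusty hunks are two halves of the same workaround. Below is a
minimal sketch of how they fit together; the map definition, the max_entries
value and the struct task_ctx stub are assumptions, since the actual task_data
definition and task_ctx layout aren't part of this diff. Only the update flag
and the delete-failure handling mirror the changes above.

	struct task_ctx {
		u32 dom_id;	/* placeholder; the real layout isn't shown here */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 1000000);	/* assumed size */
		__type(key, pid_t);
		__type(value, struct task_ctx);
	} task_data SEC(".maps");

	/* .prep_enable(): use BPF_ANY (0) rather than the BPF_NOEXIST we want so
	 * that a stale entry leaked by a spuriously failed delete in .disable()
	 * is overwritten instead of failing the insert.
	 */
	ret = bpf_map_update_elem(&task_data, &pid, &taskc, 0 /*BPF_NOEXIST*/);
	if (ret)
		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);

	/* .disable(): the delete may be rejected by BPF's recursion protection;
	 * count it in the error stat and move on instead of aborting.
	 */
	ret = bpf_map_delete_elem(&task_data, &pid);
	if (ret)
		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);

As the XXX comments note, once map deletions become reliable the allocation can
presumably return to strict BPF_NOEXIST semantics.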