diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 869115805b2882..d6a947bc981517 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -46,6 +46,11 @@
 #include <scx/common.bpf.h>
 #include "scx_flatcg.h"
 
+/*
+ * Maximum number of retries to find a valid cgroup.
+ */
+#define CGROUP_MAX_RETRIES	1024
+
 char _license[] SEC("license") = "GPL";
 
 const volatile u32 nr_cpus = 32;	/* !0 for veristat, set during init */
@@ -302,6 +307,17 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
 	bpf_spin_unlock(&cgv_tree_lock);
 }
 
+static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
+{
+	/*
+	 * Tell fcg_stopping() that this task bypassed the regular scheduling
+	 * path and should be force charged to the cgroup. 0 is used to
+	 * indicate that the task isn't bypassing, so if the current runtime is
+	 * 0, go back by one nanosecond.
+	 */
+	taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
+}
+
 s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
 	struct fcg_task_ctx *taskc;
@@ -319,35 +335,12 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake
 	/*
 	 * If select_cpu_dfl() is recommending local enqueue, the target CPU is
 	 * idle. Follow it and charge the cgroup later in fcg_stopping() after
-	 * the fact. Use the same mechanism to deal with tasks with custom
-	 * affinities so that we don't have to worry about per-cgroup dq's
-	 * containing tasks that can't be executed from some CPUs.
+	 * the fact.
 	 */
-	if (is_idle || p->nr_cpus_allowed != nr_cpus) {
-		/*
-		 * Tell fcg_stopping() that this bypassed the regular scheduling
-		 * path and should be force charged to the cgroup. 0 is used to
-		 * indicate that the task isn't bypassing, so if the current
-		 * runtime is 0, go back by one nanosecond.
-		 */
-		taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
-
-		/*
-		 * The global dq is deprioritized as we don't want to let tasks
-		 * to boost themselves by constraining its cpumask. The
-		 * deprioritization is rather severe, so let's not apply that to
-		 * per-cpu kernel threads. This is ham-fisted. We probably wanna
-		 * implement per-cgroup fallback dq's instead so that we have
-		 * more control over when tasks with custom cpumask get issued.
-		 */
-		if (is_idle ||
-		    (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
-			stat_inc(FCG_STAT_LOCAL);
-			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
-		} else {
-			stat_inc(FCG_STAT_GLOBAL);
-			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
-		}
+	if (is_idle) {
+		set_bypassed_at(p, taskc);
+		stat_inc(FCG_STAT_LOCAL);
+		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 	}
 
 	return cpu;
@@ -365,6 +358,32 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}
 
+	/*
+	 * Use direct dispatching and force charging to deal with tasks with
+	 * custom affinities so that we don't have to worry about per-cgroup
+	 * dq's containing tasks that can't be executed from some CPUs.
+	 */
+	if (p->nr_cpus_allowed != nr_cpus) {
+		set_bypassed_at(p, taskc);
+
+		/*
+		 * The global dq is deprioritized as we don't want to let tasks
+		 * boost themselves by constraining their cpumask. The
+		 * deprioritization is rather severe, so let's not apply that to
+		 * per-cpu kernel threads. This is ham-fisted. We probably wanna
+		 * implement per-cgroup fallback dq's instead so that we have
+		 * more control over when tasks with custom cpumasks get issued.
+		 */
+		if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
+			stat_inc(FCG_STAT_LOCAL);
+			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		} else {
+			stat_inc(FCG_STAT_GLOBAL);
+			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		}
+		return;
+	}
+
 	cgrp = scx_bpf_task_cgroup(p);
 	cgc = find_cgrp_ctx(cgrp);
 	if (!cgc)
@@ -691,6 +710,7 @@ static bool try_pick_next_cgroup(u64 *cgidp)
 		bpf_spin_lock(&cgv_tree_lock);
 		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
 		bpf_spin_unlock(&cgv_tree_lock);
+		stat_inc(FCG_STAT_PNC_RACE);
 	} else {
 		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
 		if (cgv_node) {
@@ -712,6 +732,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 	struct fcg_cgrp_ctx *cgc;
 	struct cgroup *cgrp;
 	u64 now = bpf_ktime_get_ns();
+	bool picked_next = false;
 
 	cpuc = find_cpu_ctx();
 	if (!cpuc)
@@ -766,10 +787,21 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 		return;
 	}
 
-	bpf_repeat(BPF_MAX_LOOPS) {
-		if (try_pick_next_cgroup(&cpuc->cur_cgid))
+	bpf_repeat(CGROUP_MAX_RETRIES) {
+		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
+			picked_next = true;
 			break;
+		}
 	}
+
+	/*
+	 * This only happens if try_pick_next_cgroup() races against the
+	 * enqueue path more than CGROUP_MAX_RETRIES times, which is extremely
+	 * unlikely and likely indicates an underlying bug. There shouldn't be
+	 * any stall risk as the race is against enqueue.
+	 */
+	if (!picked_next)
+		stat_inc(FCG_STAT_PNC_FAIL);
 }
 
 s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index b326b2d3ec3501..6c2f9715f69255 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -186,29 +186,31 @@ int main(int argc, char **argv)
 
 		printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
 		       seq++, cpu_util * 100.0, skel->data->hweight_gen);
-		printf("       act:%6llu  deact:%6llu local:%6llu global:%6llu\n",
+		printf("       act:%6llu  deact:%6llu global:%6llu local:%6llu\n",
 		       stats[FCG_STAT_ACT],
 		       stats[FCG_STAT_DEACT],
-		       stats[FCG_STAT_LOCAL],
-		       stats[FCG_STAT_GLOBAL]);
-		printf("HWT   skip:%6llu   race:%6llu  cache:%6llu update:%6llu\n",
-		       stats[FCG_STAT_HWT_SKIP],
-		       stats[FCG_STAT_HWT_RACE],
+		       stats[FCG_STAT_GLOBAL],
+		       stats[FCG_STAT_LOCAL]);
+		printf("HWT  cache:%6llu update:%6llu   skip:%6llu  race:%6llu\n",
 		       stats[FCG_STAT_HWT_CACHE],
-		       stats[FCG_STAT_HWT_UPDATES]);
+		       stats[FCG_STAT_HWT_UPDATES],
+		       stats[FCG_STAT_HWT_SKIP],
+		       stats[FCG_STAT_HWT_RACE]);
 		printf("ENQ   skip:%6llu   race:%6llu\n",
 		       stats[FCG_STAT_ENQ_SKIP],
 		       stats[FCG_STAT_ENQ_RACE]);
-		printf("CNS   keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
+		printf("CNS   keep:%6llu expire:%6llu empty:%6llu  gone:%6llu\n",
 		       stats[FCG_STAT_CNS_KEEP],
 		       stats[FCG_STAT_CNS_EXPIRE],
 		       stats[FCG_STAT_CNS_EMPTY],
 		       stats[FCG_STAT_CNS_GONE]);
-		printf("PNC nocgrp:%6llu   next:%6llu empty:%6llu gone:%6llu\n",
-		       stats[FCG_STAT_PNC_NO_CGRP],
+		printf("PNC   next:%6llu  empty:%6llu nocgrp:%6llu  gone:%6llu race:%6llu fail:%6llu\n",
 		       stats[FCG_STAT_PNC_NEXT],
 		       stats[FCG_STAT_PNC_EMPTY],
-		       stats[FCG_STAT_PNC_GONE]);
+		       stats[FCG_STAT_PNC_NO_CGRP],
+		       stats[FCG_STAT_PNC_GONE],
+		       stats[FCG_STAT_PNC_RACE],
+		       stats[FCG_STAT_PNC_FAIL]);
 		printf("BAD remove:%6llu\n",
 		       acc_stats[FCG_STAT_BAD_REMOVAL]);
 		fflush(stdout);
diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h
index 490758ed41f0fb..6f2ea50acb1cb3 100644
--- a/tools/sched_ext/scx_flatcg.h
+++ b/tools/sched_ext/scx_flatcg.h
@@ -28,6 +28,8 @@ enum fcg_stat_idx {
 	FCG_STAT_PNC_NEXT,
 	FCG_STAT_PNC_EMPTY,
 	FCG_STAT_PNC_GONE,
+	FCG_STAT_PNC_RACE,
+	FCG_STAT_PNC_FAIL,
 	FCG_STAT_BAD_REMOVAL,
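
A note on the sentinel encoding that the new set_bypassed_at() helper centralizes: 0 doubles as the "not bypassing" flag, so a task whose sum_exec_runtime is exactly 0 is stored as (u64)-1, i.e. moved back by one nanosecond modulo 2^64. The stand-alone user-space sketch below illustrates just that encoding; encode_bypassed_at() is a hypothetical stand-in for the assignment in set_bypassed_at(), not scheduler code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the assignment in set_bypassed_at(). */
static uint64_t encode_bypassed_at(uint64_t sum_exec_runtime)
{
	/* GNU "a ?: b" is shorthand for "a ? a : b" */
	return sum_exec_runtime ? sum_exec_runtime : (uint64_t)-1;
}

int main(void)
{
	/* 0 would read as "not bypassing", so it becomes 2^64 - 1 ... */
	printf("%llu\n", (unsigned long long)encode_bypassed_at(0));
	/* ... while any nonzero runtime is stored unchanged */
	printf("%llu\n", (unsigned long long)encode_bypassed_at(12345));
	return 0;
}

Since fcg_stopping() charges relative to the stored value, the wrapped sentinel presumably perturbs the charge by only that one nanosecond, matching the "go back by one nanosecond" wording in the comment.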
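Likewise, the fcg_dispatch() change turns an effectively unbounded bpf_repeat(BPF_MAX_LOOPS) into a bounded retry whose exhaustion is accounted via FCG_STAT_PNC_FAIL instead of being retried forever. A compilable user-space sketch of the pattern, with try_pick() as a hypothetical stand-in for try_pick_next_cgroup() and a plain counter for the stat:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define CGROUP_MAX_RETRIES 1024

static unsigned long long nr_pnc_fail; /* stand-in for FCG_STAT_PNC_FAIL */

/* Hypothetical stand-in for try_pick_next_cgroup(); fails randomly here to
 * model losing the race against the enqueue path. */
static bool try_pick(unsigned long long *cgidp)
{
	if (rand() % 4 == 0)
		return false;
	*cgidp = 42; /* illustrative cgroup id */
	return true;
}

int main(void)
{
	unsigned long long cgid = 0;
	bool picked_next = false;
	int i;

	/* Bounded retry: a miss on all CGROUP_MAX_RETRIES attempts is counted
	 * rather than retried indefinitely; per the patch comment, exhaustion
	 * would indicate an underlying bug. */
	for (i = 0; i < CGROUP_MAX_RETRIES; i++) {
		if (try_pick(&cgid)) {
			picked_next = true;
			break;
		}
	}
	if (!picked_next)
		nr_pnc_fail++;

	printf("picked=%d cgid=%llu pnc_fail=%llu\n",
	       picked_next, cgid, nr_pnc_fail);
	return 0;
}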