From 88e756003865b0be12062395b14384952e962883 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2024 19:05:19 -1000 Subject: [PATCH] scx: Sync schedulers from SCX v0.1.5 (74923c6cdbc3) --- tools/sched_ext/include/scx/common.bpf.h | 3 +- tools/sched_ext/scx_central.bpf.c | 8 ++ tools/sched_ext/scx_central.c | 7 +- tools/sched_ext/scx_flatcg.bpf.c | 41 ++++-- tools/sched_ext/scx_flatcg.c | 4 +- tools/sched_ext/scx_layered/Cargo.toml | 8 +- tools/sched_ext/scx_layered/README.md | 37 ++++++ .../sched_ext/scx_layered/src/bpf/main.bpf.c | 19 +-- tools/sched_ext/scx_layered/src/main.rs | 20 +-- tools/sched_ext/scx_pair.c | 9 +- tools/sched_ext/scx_qmap.bpf.c | 10 +- tools/sched_ext/scx_qmap.c | 3 +- tools/sched_ext/scx_rusty/Cargo.toml | 8 +- tools/sched_ext/scx_rusty/README.md | 36 ++++++ tools/sched_ext/scx_rusty/src/bpf/main.bpf.c | 29 ++--- tools/sched_ext/scx_rusty/src/main.rs | 67 +++++----- tools/sched_ext/scx_simple.bpf.c | 25 ++-- tools/sched_ext/scx_userland.bpf.c | 119 +++++++++++++++--- tools/sched_ext/scx_userland.c | 81 ++++++++++-- tools/sched_ext/scx_userland.h | 2 - 20 files changed, 392 insertions(+), 144 deletions(-) create mode 100644 tools/sched_ext/scx_layered/README.md create mode 100644 tools/sched_ext/scx_rusty/README.md diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 5c503c23583685..f2336d357106e5 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -10,7 +10,7 @@ #include "vmlinux.h" #include #include -#include +#include #include "user_exit_info.h" #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ @@ -68,6 +68,7 @@ const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 4f398249fb2cca..51ddb0a14bc610 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -161,6 +161,14 @@ static bool dispatch_to_cpu(s32 cpu) __sync_fetch_and_add(&nr_mismatches, 1); scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now as will fail the + * next dispatch operation. 
+ */ + if (!scx_bpf_dispatch_nr_slots()) + break; continue; } diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index a3d22409e9ce53..501505001bf98b 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -103,17 +104,17 @@ int main(int argc, char **argv) while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); - printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", skel->bss->nr_total, skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries); - printf("overflow:%10lu\n", + printf("overflow:%10" PRIu64 "\n", skel->bss->nr_overflows); fflush(stdout); sleep(1); diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 84a60d7e4024ba..869115805b2882 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -123,7 +123,7 @@ struct { } task_ctx SEC(".maps"); /* gets inc'd on weight tree changes to expire the cached hweights */ -unsigned long hweight_gen = 1; +u64 hweight_gen = 1; static u64 div_round_up(u64 dividend, u64 divisor) { @@ -302,16 +302,18 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) bpf_spin_unlock(&cgv_tree_lock); } -void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct fcg_task_ctx *taskc; - struct cgroup *cgrp; - struct fcg_cgrp_ctx *cgc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); if (!taskc) { scx_bpf_error("task_ctx lookup failed"); - return; + return cpu; } /* @@ -321,7 +323,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) * affinities so that we don't have to worry about per-cgroup dq's * containing tasks that can't be executed from some CPUs. */ - if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) { + if (is_idle || p->nr_cpus_allowed != nr_cpus) { /* * Tell fcg_stopping() that this bypassed the regular scheduling * path and should be force charged to the cgroup. 0 is used to @@ -338,14 +340,28 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) * implement per-cgroup fallback dq's instead so that we have * more control over when tasks with custom cpumask get issued. 
*/ - if ((enq_flags & SCX_ENQ_LOCAL) || + if (is_idle || (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } else { stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); } + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); return; } @@ -756,8 +772,8 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) } } -s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct fcg_task_ctx *taskc; struct fcg_cgrp_ctx *cgc; @@ -893,13 +909,14 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops flatcg_ops = { + .select_cpu = (void *)fcg_select_cpu, .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, .runnable = (void *)fcg_runnable, .running = (void *)fcg_running, .stopping = (void *)fcg_stopping, .quiescent = (void *)fcg_quiescent, - .prep_enable = (void *)fcg_prep_enable, + .init_task = (void *)fcg_init_task, .cgroup_set_weight = (void *)fcg_cgroup_set_weight, .cgroup_init = (void *)fcg_cgroup_init, .cgroup_exit = (void *)fcg_cgroup_exit, diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6a6e47c83ede7f..b326b2d3ec3501 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -183,7 +184,7 @@ int main(int argc, char **argv) memcpy(last_stats, acc_stats, sizeof(acc_stats)); - printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n", + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", seq++, cpu_util * 100.0, skel->data->hweight_gen); printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", stats[FCG_STAT_ACT], @@ -210,6 +211,7 @@ int main(int argc, char **argv) stats[FCG_STAT_PNC_GONE]); printf("BAD remove:%6llu\n", acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); nanosleep(&intv_ts, NULL); } diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 19dd0243a9f2a6..37a811e3807e22 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_layered" -version = "0.0.1" +version = "0.0.4" authors = ["Tejun Heo ", "Meta"] edition = "2021" description = "Userspace scheduling with BPF for Ads" @@ -13,16 +13,16 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7" lazy_static = "1.4" -libbpf-rs = "0.21" +libbpf-rs = "0.22" libc = "0.2" log = "0.4" -scx_utils = "0.3" +scx_utils = "0.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = "0.3" +scx_utils = "0.5" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/README.md b/tools/sched_ext/scx_layered/README.md new file mode 100644 index 00000000000000..37c554b2354db7 --- /dev/null +++ 
b/tools/sched_ext/scx_layered/README.md
@@ -0,0 +1,37 @@
+# scx_layered
+
+This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), a Linux kernel feature that enables implementing kernel thread schedulers in BPF and loading them dynamically. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).
+
+## Overview
+
+A highly configurable multi-layer BPF / user space hybrid scheduler.
+
+scx_layered allows the user to classify tasks into multiple layers, and apply
+different scheduling policies to those layers. For example, a layer could be
+created containing all tasks in the `user.slice` cgroup, and a policy could be
+specified that ensures the layer is given at least 80% CPU utilization for
+some subset of CPUs on the system.
+
+## How To Install
+
+Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered`
+
+## Typical Use Case
+
+scx_layered is designed to be highly customizable, and can be targeted at
+specific applications. For example, if you had a high-priority service that
+required priority access to all but 1 physical core to ensure acceptable p99
+latencies, you could grant the service exactly that. If the service ends up
+not utilizing all of those cores, they can be used by other layers until
+they're needed.
+
+## Production Ready?
+
+Yes. If tuned correctly, scx_layered should be performant across various CPU
+architectures and workloads.
+
+That said, you may run into an issue with infeasible weights, where a task with
+a very high weight may cause the scheduler to incorrectly leave cores idle
+because it thinks they're necessary to accommodate the compute for a single
+task. This can also happen in CFS, and should soon be addressed for
+scx_layered.
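Both new READMEs describe schedulers built on the sched_ext interface this patch syncs against, where the scheduling policy itself lives in a BPF program. For readers unfamiliar with that interface, the sketch below condenses the scx_simple changes later in this patch into a minimal scheduler: .select_cpu() asks the default idle-CPU picker via scx_bpf_select_cpu_dfl() and, if an idle CPU was found, queues the task straight on that CPU's local DSQ, while .enqueue() sends everything else to the shared global FIFO. The callback and ops-struct names are illustrative and the include line assumes the tools/sched_ext header layout; see the scx_simple.bpf.c hunks in this patch for the complete, vtime-aware version.

```c
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* Let the default idle-CPU picker choose; it reports whether @p
	 * can start running immediately on an idle CPU. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		/* Queue directly on the chosen CPU's local DSQ. */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Everything else goes to the shared global FIFO, which CPUs
	 * consume when their local DSQs run dry. */
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.select_cpu	= (void *)sketch_select_cpu,
	.enqueue	= (void *)sketch_enqueue,
	.name		= "sketch",
};
```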
diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c index 98d9418e1adf14..21dd0e4cd83953 100644 --- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -745,8 +745,8 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } -s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct task_ctx tctx_init = { .pid = p->pid, @@ -805,14 +805,8 @@ s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p) -{ - s32 pid = p->pid; - - bpf_map_delete_elem(&task_ctxs, &pid); -} - -void BPF_STRUCT_OPS(layered_disable, struct task_struct *p) +void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { struct cpu_ctx *cctx; struct task_ctx *tctx; @@ -977,9 +971,8 @@ struct sched_ext_ops layered = { .quiescent = (void *)layered_quiescent, .set_weight = (void *)layered_set_weight, .set_cpumask = (void *)layered_set_cpumask, - .prep_enable = (void *)layered_prep_enable, - .cancel_enable = (void *)layered_cancel_enable, - .disable = (void *)layered_disable, + .init_task = (void *)layered_init_task, + .exit_task = (void *)layered_exit_task, .init = (void *)layered_init, .exit = (void *)layered_exit, .name = "layered", diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 8f4d77db04ea9e..5b5374226f49aa 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -1122,10 +1122,10 @@ struct Scheduler<'a> { impl<'a> Scheduler<'a> { fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec) -> Result<()> { - skel.rodata().nr_layers = specs.len() as u32; + skel.rodata_mut().nr_layers = specs.len() as u32; for (spec_i, spec) in specs.iter().enumerate() { - let layer = &mut skel.bss().layers[spec_i]; + let layer = &mut skel.bss_mut().layers[spec_i]; for (or_i, or) in spec.matches.iter().enumerate() { for (and_i, and) in or.iter().enumerate() { @@ -1176,12 +1176,12 @@ impl<'a> Scheduler<'a> { let mut skel = skel_builder.open().context("Failed to open BPF program")?; // Initialize skel according to @opts. 
- skel.rodata().debug = opts.verbose as u32; - skel.rodata().slice_ns = opts.slice_us * 1000; - skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; - skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; + skel.rodata_mut().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; + skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; for cpu in cpu_pool.all_cpus.iter_ones() { - skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8); + skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8); } Self::init_layers(&mut skel, &layer_specs)?; @@ -1274,7 +1274,7 @@ impl<'a> Scheduler<'a> { { Self::update_bpf_layer_cpumask( &self.layers[idx], - &mut self.skel.bss().layers[idx], + &mut self.skel.bss_mut().layers[idx], ); updated = true; } @@ -1288,7 +1288,7 @@ impl<'a> Scheduler<'a> { let nr_available_cpus = available_cpus.count_ones(); for idx in 0..self.layers.len() { let layer = &mut self.layers[idx]; - let bpf_layer = &mut self.skel.bss().layers[idx]; + let bpf_layer = &mut self.skel.bss_mut().layers[idx]; match &layer.kind { LayerKind::Open { .. } => { layer.cpus.copy_from_bitslice(&available_cpus); @@ -1299,7 +1299,7 @@ impl<'a> Scheduler<'a> { } } - self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32; + self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32; for (lidx, layer) in self.layers.iter().enumerate() { self.nr_layer_cpus_min_max[lidx] = ( diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 693f095b8c6602..1eb30efeb0ed53 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -142,18 +143,18 @@ int main(int argc, char **argv) while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); - printf(" total:%10lu dispatch:%10lu missing:%10lu\n", + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", skel->bss->nr_total, skel->bss->nr_dispatched, skel->bss->nr_missing); - printf(" kicks:%10lu preemptions:%7lu\n", + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", skel->bss->nr_kicks, skel->bss->nr_preemptions); - printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", skel->bss->nr_exps, skel->bss->nr_exp_waits, skel->bss->nr_exp_empty); - printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", skel->bss->nr_cgrp_next, skel->bss->nr_cgrp_coll, skel->bss->nr_cgrp_empty); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 831df3f644d5a8..2fb75543a1640e 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -95,8 +95,8 @@ struct { } dispatch_idx_cnt SEC(".maps"); /* Statistics */ -unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -unsigned long nr_core_sched_execed; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -354,8 +354,8 @@ void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args __sync_fetch_and_add(&nr_reenqueued, cnt); } -s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct 
scx_init_task_args *args) { if (p->tgid == disallow_tgid) p->scx.disallow = true; @@ -391,7 +391,7 @@ struct sched_ext_ops qmap_ops = { .dispatch = (void *)qmap_dispatch, .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, - .prep_enable = (void *)qmap_prep_enable, + .init_task = (void *)qmap_init_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .flags = SCX_OPS_ENQ_LAST, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index d817115c0b0a84..7008b913864490 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,7 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu, core=%lu\n", + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed); diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index 309643687d0c6f..a8b4231d1bde9d 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rusty" -version = "0.5.0" +version = "0.5.3" authors = ["Dan Schatzberg ", "Meta"] edition = "2021" description = "Userspace scheduling with BPF" @@ -13,15 +13,15 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7.0" hex = "0.4.3" -libbpf-rs = "0.21.0" +libbpf-rs = "0.22.0" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_utils = "0.3" +scx_utils = "0.5" simplelog = "0.12.0" [build-dependencies] -scx_utils = "0.3" +scx_utils = "0.5" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_rusty/README.md b/tools/sched_ext/scx_rusty/README.md new file mode 100644 index 00000000000000..990e51aaf43b3a --- /dev/null +++ b/tools/sched_ext/scx_rusty/README.md @@ -0,0 +1,36 @@ +# scx_rusty + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +## How To Install + +Available as a [Rust crate](https://crates.io/crates/scx_rusty): `cargo add scx_rusty` + +## Typical Use Case + +Rusty is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), +as well as how Rusty should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +## Production Ready? + +Yes. If tuned correctly, rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. 
Note +however that scx_rusty does not yet disambiguate between LLCs in different NUMA +nodes, so it may perform better on multi-CCX machines where all the LLCs share +the same socket, as opposed to multi-socket machines. + +Note as well that you may run into an issue with infeasible weights, where a +task with a very high weight may cause the scheduler to incorrectly leave cores +idle because it thinks they're necessary to accommodate the compute for a +single task. This can also happen in CFS, and should soon be addressed for +scx_rusty. diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index c85e95bf372a49..fe4de979f2a2df 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -425,10 +425,13 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) goto enoent; - if (kthreads_local && - (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + if (p->nr_cpus_allowed == 1) { cpu = prev_cpu; - stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + if (kthreads_local && (p->flags & PF_KTHREAD)) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + } else { + stat_add(RUSTY_STAT_PINNED, 1); + } goto direct; } @@ -436,7 +439,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the * local dsq of the waker. */ - if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { + if (wake_flags & SCX_WAKE_SYNC) { struct task_struct *current = (void *)bpf_get_current_task(); if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && @@ -475,13 +478,6 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, } } - /* If only one CPU is allowed, dispatch */ - if (p->nr_cpus_allowed == 1) { - stat_add(RUSTY_STAT_PINNED, 1); - cpu = prev_cpu; - goto direct; - } - has_idle_cores = !bpf_cpumask_empty(idle_smtmask); /* did @p get pulled out to a foreign domain by e.g. greedy execution? 
*/ @@ -956,8 +952,8 @@ void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } -s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct bpf_cpumask *cpumask; struct task_ctx taskc = { .dom_active_pids_gen = -1 }; @@ -1006,7 +1002,8 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) +void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { pid_t pid = p->pid; long ret; @@ -1159,8 +1156,8 @@ struct sched_ext_ops rusty = { .quiescent = (void *)rusty_quiescent, .set_weight = (void *)rusty_set_weight, .set_cpumask = (void *)rusty_set_cpumask, - .prep_enable = (void *)rusty_prep_enable, - .disable = (void *)rusty_disable, + .init_task = (void *)rusty_init_task, + .exit_task = (void *)rusty_exit_task, .init = (void *)rusty_init, .exit = (void *)rusty_exit, .name = "rusty", diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index ff7cc9d80a7eaa..3192ee049f9f2d 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -187,6 +187,15 @@ fn read_total_cpu(reader: &procfs::ProcReader) -> Result { .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) } +fn sub_or_zero(curr: &u64, prev: &u64) -> u64 +{ + if let Some(res) = curr.checked_sub(*prev) { + res + } else { + 0 + } +} + fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { match (curr, prev) { ( @@ -213,14 +222,14 @@ fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { .. }, ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); let busy_usec = user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; @@ -426,7 +435,7 @@ impl Tuner { .read_stat()? .cpus_map .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; - let ti = &mut skel.bss().tune_input; + let ti = &mut skel.bss_mut().tune_input; let mut dom_nr_cpus = vec![0; self.top.nr_doms]; let mut dom_util_sum = vec![0.0; self.top.nr_doms]; @@ -620,7 +629,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // XXX - We can't read task_ctx inline because self.skel.bss() // borrows mutably and thus conflicts with self.skel.maps(). 
const MAX_PIDS: u64 = bpf_intf::consts_MAX_DOM_ACTIVE_PIDS as u64; - let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; + let active_pids = &mut self.skel.bss_mut().dom_active_pids[dom as usize]; let mut pids = vec![]; let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); @@ -901,16 +910,16 @@ impl<'a> Scheduler<'a> { Topology::from_cache_level(opts.cache_level, nr_cpus)? }); - skel.rodata().nr_doms = top.nr_doms as u32; - skel.rodata().nr_cpus = top.nr_cpus as u32; + skel.rodata_mut().nr_doms = top.nr_doms as u32; + skel.rodata_mut().nr_cpus = top.nr_cpus as u32; for (cpu, dom) in top.cpu_dom.iter().enumerate() { - skel.rodata().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; + skel.rodata_mut().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; } for (dom, cpus) in top.dom_cpus.iter().enumerate() { let raw_cpus_slice = cpus.as_raw_slice(); - let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; + let dom_cpumask_slice = &mut skel.rodata_mut().dom_cpumasks[dom]; let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); left.clone_from_slice(cpus.as_raw_slice()); info!( @@ -921,13 +930,13 @@ impl<'a> Scheduler<'a> { ); } - skel.rodata().slice_ns = opts.slice_us * 1000; - skel.rodata().load_half_life = (opts.load_half_life * 1000000000.0) as u32; - skel.rodata().kthreads_local = opts.kthreads_local; - skel.rodata().fifo_sched = opts.fifo_sched; - skel.rodata().switch_partial = opts.partial; - skel.rodata().greedy_threshold = opts.greedy_threshold; - skel.rodata().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().load_half_life = (opts.load_half_life * 1000000000.0) as u32; + skel.rodata_mut().kthreads_local = opts.kthreads_local; + skel.rodata_mut().fifo_sched = opts.fifo_sched; + skel.rodata_mut().switch_partial = opts.partial; + skel.rodata_mut().greedy_threshold = opts.greedy_threshold; + skel.rodata_mut().debug = opts.verbose as u32; // Attach. 
let mut skel = skel.load().context("Failed to load BPF program")?; @@ -994,14 +1003,14 @@ impl<'a> Scheduler<'a> { guest_nice_usec: _, }, ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); let busy_usec = user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index 7485acbc4f5092..95035aa29b10e6 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -51,19 +51,22 @@ static inline bool vtime_before(u64 a, u64 b) return (s64)(a - b) < 0; } -void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { - /* - * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that - * running @p on its CPU directly shouldn't affect fairness. Just queue - * it on the local FIFO. - */ - if (enq_flags & SCX_ENQ_LOCAL) { + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { stat_inc(0); /* count local queueing */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); - return; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ stat_inc(1); /* count global queueing */ if (fifo_sched) { @@ -120,8 +123,7 @@ void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; } -void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, - struct scx_enable_args *args) +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) { p->scx.dsq_vtime = vtime_now; } @@ -141,6 +143,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, .enqueue = (void *)simple_enqueue, .dispatch = (void *)simple_dispatch, .running = (void *)simple_running, diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c index f2791a6aecc8b8..4cdc3a6fb880a5 100644 --- a/tools/sched_ext/scx_userland.bpf.c +++ b/tools/sched_ext/scx_userland.bpf.c @@ -20,10 +20,14 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include #include #include "scx_userland.h" +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + char _license[] SEC("license") = "GPL"; const volatile bool switch_partial; @@ -35,13 +39,24 @@ const volatile u32 num_possible_cpus = 64; /* Stats that are printed by user space. 
*/ u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; -struct user_exit_info uei; +/* + * Number of tasks that are queued for scheduling. + * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; /* - * Whether the user space scheduler needs to be scheduled due to a task being - * enqueued in user space. + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. */ -static bool usersched_needed; +volatile u64 nr_scheduled; + +struct user_exit_info uei; /* * The map containing tasks that are enqueued in user space from the kernel. @@ -50,7 +65,7 @@ static bool usersched_needed; */ struct { __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, USERLAND_MAX_TASKS); + __uint(max_entries, MAX_ENQUEUED_TASKS); __type(value, struct scx_userland_enqueued_task); } enqueued SEC(".maps"); @@ -61,7 +76,7 @@ struct { */ struct { __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, USERLAND_MAX_TASKS); + __uint(max_entries, MAX_ENQUEUED_TASKS); __type(value, s32); } dispatched SEC(".maps"); @@ -78,6 +93,29 @@ struct { __type(value, struct task_ctx); } task_ctx_stor SEC(".maps"); +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). + */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + static bool is_usersched_task(const struct task_struct *p) { return p->pid == usersched_pid; @@ -136,7 +174,6 @@ static void dispatch_user_scheduler(void) { struct task_struct *p; - usersched_needed = false; p = usersched_task(); if (p) { scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); @@ -146,9 +183,8 @@ static void dispatch_user_scheduler(void) static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) { - struct scx_userland_enqueued_task task; + struct scx_userland_enqueued_task task = {}; - memset(&task, 0, sizeof(task)); task.pid = p->pid; task.sum_exec_runtime = p->se.sum_exec_runtime; task.weight = p->scx.weight; @@ -162,7 +198,7 @@ static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } else { __sync_fetch_and_add(&nr_user_enqueues, 1); - usersched_needed = true; + set_usersched_needed(); } } @@ -191,10 +227,10 @@ void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) { - if (usersched_needed) + if (test_and_clear_usersched_needed()) dispatch_user_scheduler(); - bpf_repeat(4096) { + bpf_repeat(MAX_ENQUEUED_TASKS) { s32 pid; struct task_struct *p; @@ -215,8 +251,57 @@ void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) } } -s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +/* + * A CPU is about to change its idle state. 
If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from and idle state, a CPU owner will + * be assigned in .running(). + */ + if (!idle) + return; + /* + * A CPU is now available, notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). + * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows to determine if there is still some + * pending work to do for the scheduler: new tasks have been queued + * since last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake-up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. + */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) { if (bpf_task_storage_get(&task_ctx_stor, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -254,9 +339,11 @@ struct sched_ext_ops userland_ops = { .select_cpu = (void *)userland_select_cpu, .enqueue = (void *)userland_enqueue, .dispatch = (void *)userland_dispatch, - .prep_enable = (void *)userland_prep_enable, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, .init = (void *)userland_init, .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, .timeout_ms = 3000, .name = "userland", }; diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index fef028a1756e0b..368acd0b38bd9e 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -36,6 +36,8 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" "Usage: %s [-b BATCH] [-p]\n" "\n" " -b BATCH The number of tasks to batch when dispatching (default: 8)\n" @@ -55,7 +57,10 @@ static struct scx_userland *skel; static struct bpf_link *ops_link; /* Stats collected in user space. */ -static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; /* The data structure containing tasks that are enqueued in user space. 
*/ struct enqueued_task { @@ -80,13 +85,15 @@ LIST_HEAD(listhead, enqueued_task); static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); /* - * The statically allocated array of tasks. We use a statically allocated list - * here to avoid having to allocate on the enqueue path, which could cause a + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a * deadlock. A more substantive user space scheduler could e.g. provide a hook * for newly enabled tasks that are passed to the scheduler from the * .prep_enable() callback to allows the scheduler to allocate on safe paths. */ -struct enqueued_task tasks[USERLAND_MAX_TASKS]; +struct enqueued_task *tasks; +static int pid_max; static double min_vruntime; @@ -95,6 +102,41 @@ static void sigint_handler(int userland) exit_req = 1; } +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + static __u32 task_pid(const struct enqueued_task *task) { return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); @@ -106,8 +148,7 @@ static int dispatch_task(__s32 pid) err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); if (err) { - fprintf(stderr, "Failed to dispatch task %d\n", pid); - exit_req = 1; + nr_vruntime_failed++; } else { nr_vruntime_dispatches++; } @@ -117,7 +158,7 @@ static int dispatch_task(__s32 pid) static struct enqueued_task *get_enqueued_task(__s32 pid) { - if (pid >= USERLAND_MAX_TASKS) + if (pid >= pid_max) return NULL; return &tasks[pid]; @@ -153,6 +194,7 @@ static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) update_enqueued(curr, bpf_task); nr_vruntime_enqueues++; + nr_curr_enqueued++; /* * Enqueue the task in a vruntime-sorted list. A more optimal data @@ -186,8 +228,11 @@ static void drain_enqueued_map(void) struct scx_userland_enqueued_task task; int err; - if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; return; + } err = vruntime_enqueue(&task); if (err) { @@ -210,18 +255,24 @@ static void dispatch_batch(void) task = LIST_FIRST(&vruntime_head); if (!task) - return; + break; min_vruntime = task->vruntime; pid = task_pid(task); LIST_REMOVE(task, entries); err = dispatch_task(pid); if (err) { - fprintf(stderr, "Failed to dispatch task %d in %u\n", - pid, i); - return; + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. 
+ */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; } + nr_curr_enqueued--; } + skel->bss->nr_scheduled = nr_curr_enqueued; } static void *run_stats_printer(void *arg) @@ -248,8 +299,10 @@ static void *run_stats_printer(void *arg) printf("|-----------------------|\n"); printf("| enq: %10llu |\n", nr_vruntime_enqueues); printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); printf("o-----------------------o\n"); printf("\n\n"); + fflush(stdout); sleep(1); } @@ -272,6 +325,10 @@ static void bootstrap(int argc, char **argv) }; bool switch_partial = false; + err = init_tasks(); + if (err) + exit(err); + signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); libbpf_set_strict_mode(LIBBPF_STRICT_ALL); diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h index 639c6809c5ffe2..684fb2dd5de960 100644 --- a/tools/sched_ext/scx_userland.h +++ b/tools/sched_ext/scx_userland.h @@ -4,8 +4,6 @@ #ifndef __SCX_USERLAND_COMMON_H #define __SCX_USERLAND_COMMON_H -#define USERLAND_MAX_TASKS 8192 - /* * An instance of a task that has been enqueued by the kernel for consumption * by a user space global scheduler thread.
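To make the kernel/user hand-off added to scx_userland above concrete, here is a rough sketch of how the user-space half fits together. It only uses functions and globals that appear in the scx_userland.c hunks above (exit_req, drain_enqueued_map(), dispatch_batch()); the loop function name and the sched_yield() pacing are assumptions for illustration, not the program's actual main loop.

```c
#include <sched.h>

/*
 * Illustrative only: drain_enqueued_map() pulls tasks out of the BPF
 * "enqueued" queue into the vruntime-sorted list and refreshes
 * nr_queued/nr_scheduled, while dispatch_batch() pushes a batch of PIDs
 * into the "dispatched" queue and publishes the new nr_scheduled so the
 * BPF update_idle() callback knows whether another wakeup is needed.
 */
static void example_schedule_loop(void)
{
	while (!exit_req) {
		drain_enqueued_map();
		dispatch_batch();
		/* Pacing is a placeholder; the real loop decides when to yield. */
		sched_yield();
	}
}
```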