diff --git a/Cargo.toml b/Cargo.toml
index 558f820701..a9b79d3521 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -152,6 +152,10 @@ malloc_counted_size = []
 # Count the size of all live objects in GC
 count_live_bytes_in_gc = []
 
+# Work around a problem where bpftrace scripts (see tools/tracing/timeline/capture.bt) cannot
+# capture the type names of work packets.
+bpftrace_workaround = []
+
 # Do not modify the following line - ci-common.sh matches it
 # -- Mutally exclusive features --
 # Only one feature from each group can be provided. Otherwise build will fail.
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index e415a14e46..1d850a7c1f 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -384,6 +384,13 @@ impl<VM: VMBinding> GCWorker<VM> {
         // probe! expands to an empty block on unsupported platforms
         #[allow(unused_variables)]
         let typename = work.get_type_name();
+
+        // Work around a problem where bpftrace scripts cannot see the work packet names,
+        // by forcing a load from the packet name.
+        // See the "Known issues" section in `tools/tracing/timeline/README.md`.
+        #[cfg(feature = "bpftrace_workaround")]
+        std::hint::black_box(unsafe { *(typename.as_ptr()) });
+
         probe!(mmtk, work, typename.as_ptr(), typename.len());
         work.do_work_with_stat(self, mmtk);
     }
diff --git a/tools/tracing/README.md b/tools/tracing/README.md
index 23a633dd21..7c5584d082 100644
--- a/tools/tracing/README.md
+++ b/tools/tracing/README.md
@@ -1,230 +1,54 @@
-# MMTk performace tracing
+# eBPF-based tracing tools
 
 ## Notes for MMTk developers
+
 Please open pull requests if you develop new tools that others might find useful. When you add new tools, please update this documentation. If you change MMTk internals that the tracing tools depend on (such as the definition of `enum WorkBucketStage`), please update the scripts accordingly.
 
 ## Notes for MMTk users
+
 Since some of the tools depend on the MMTk internals, please use the tools shipped with the MMTk release you use.
 
 ## Tracepoints
-Currently, the core provides the following tracepoints.
-- `mmtk:collection_initialized()`: GC is enabled
-- `mmtk:harness_begin()`: the timing iteration of a benchmark begins
-- `mmtk:harness_end()`: the timing iteration of a benchmark ends
-- `mmtk:gccontroller_run()`: the GC controller thread enters its work loop
-- `mmtk:gcworker_run()`: a GC worker thread enters its work loop
-- `mmtk:gc_start()`: a collection epoch starts
-- `mmtk:gc_end()`: a collection epoch ends
-- `mmtk:process_edges(num_edges: int, is_roots: bool)`: a invocation of the
-`process_edges` method. The first argument is the number of edges to be processed,
-and the second argument is whether these edges are root edges.
-- `mmtk:bucket_opened(id: int)`: a work bucket opened. The first argument is the
-numerical representation of `enum WorkBucketStage`.
-- `mmtk:work_poll()`: a work packet is to be polled.
-- `mmtk:work(type_name: char *, type_name_len: int)`: a work packet was just
-executed. The first argument is points to the string of the Rust type name of
-the work packet, and the second argument is the length of the string.
-- `mmtk:alloc_slow_once_start()`: the allocation slow path starts.
-- `mmtk:alloc_slow_once_end()`: the allocation slow path ends.
-
-## Running tracing tools
-The tracing tools are to be invoked by a wrapper script `run.py`.
-```
-usage: run.py [-h] [-b BPFTRACE] -m MMTK [-H] [-p] [-f {text,json}] tool
-positional arguments:
-  tool                  Name of the bpftrace tool
-
-optional arguments:
-  -h, --help            show this help message and exit
-  -b BPFTRACE, --bpftrace BPFTRACE
-                        Path of the bpftrace executable
-  -m MMTK, --mmtk MMTK  Path of the MMTk binary
-  -H, --harness         Only collect data for the timing iteration (harness_begin/harness_end)
-  -p, --print-script    Print the content of the bpftrace script
-  -f {text,json}, --format {text,json}
-                        bpftrace output format
-```
-
-- `-b`: the path to the `bpftrace` executable. By default, it uses `bpftrace`
-executable in your `PATH`. We strongly recommend you use the latest statically
-complied `bpftrace` from [upstream](https://github.com/iovisor/bpftrace/releases).
-You need to be able to have sudo permission for whichever `bpftrace` you want to use.
-- `-m`: the path to a MMTk binary that contains the tracepoints.
-This depends on the binding you use.
-For the OpenJDK binding, it should be `jdk/lib/server/libmmtk_openjdk.so` under
-your build folder.
-To check whether the binary contains tracepoints, you can use `readelf -n`.
-You should see a bunch of `stapsdt` notes with `mmtk` as the provider.
-- `-H`: pass this flag is you want to only measure the timing iteration of a
-benchmark.
-By default, the tracing tools will measure the entire execution.
-- `-p`: print the entire tracing script before execution.
-This is mainly for debugging use.
-- `-f`: change the bpftrace output format.
-By default, it uses human-readable plain text output (`text`).
-You can set this to `json` for easy parsing.
+Currently, the core provides the following tracepoints.
+
-Please run the tracing tools **before** running the workload.
-If you use `-H`, the tracing tools will automatically end with `harness_end` is
-called.
-Otherwise, you will need to terminate the tools manually with `Ctrl-C`.
-These tools also have a timeout of 1200 seconds so not to stall unattended
-benchmark execution.
+- `mmtk:collection_initialized()`: GC is enabled
+- `mmtk:harness_begin()`: the timing iteration of a benchmark begins
+- `mmtk:harness_end()`: the timing iteration of a benchmark ends
+- `mmtk:gccontroller_run()`: the GC controller thread enters its work loop
+- `mmtk:gcworker_run()`: a GC worker thread enters its work loop
+- `mmtk:gc_start()`: a collection epoch starts
+- `mmtk:gc_end()`: a collection epoch ends
+- `mmtk:process_edges(num_edges: int, is_roots: bool)`: an invocation of the `process_edges`
+  method. The first argument is the number of edges to be processed, and the second argument is
+  whether these edges are root edges.
+- `mmtk:bucket_opened(id: int)`: a work bucket is opened. The first argument is the numerical
+  representation of `enum WorkBucketStage`.
+- `mmtk:work_poll()`: a work packet is about to be polled.
+- `mmtk:work(type_name: char *, type_name_len: int)`: a work packet was just executed. The first
+  argument points to the string of the Rust type name of the work packet, and the second
+  argument is the length of the string.
+- `mmtk:alloc_slow_once_start()`: the allocation slow path starts.
+- `mmtk:alloc_slow_once_end()`: the allocation slow path ends.
 
 ## Tracing tools
-### Measuring the time spend in allocation slow path (`alloc_slow`)
-This tool measures the distribution of the allocation slow path time.
-The time unit is 400ns, so that we use the histogram bins with higher
-fidelity better.
- -Sample output: -``` -@alloc_slow_hist: -[4, 8) 304 |@ | -[8, 16) 12603 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -[16, 32) 8040 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[32, 64) 941 |@@@ | -[64, 128) 171 | | -[128, 256) 13 | | -[256, 512) 2 | | -[512, 1K) 0 | | -[1K, 2K) 0 | | -[2K, 4K) 0 | | -[4K, 8K) 0 | | -[8K, 16K) 0 | | -[16K, 32K) 14 | | -[32K, 64K) 37 | | -[64K, 128K) 19 | | -[128K, 256K) 1 | | -``` - -In the above output, we can see that most allocation slow paths finish between -3.2us and 6.4us. -However, there is a long tail, presumably due to GC pauses. - -### Measuring the time spend in different GC stages (`gc_stages`) -This tool measures the time spent in different stages of GC: before `Closure`, -during `Closure`, and after `Closure`. -The time unit is ns. - -Sample output: -``` -@closure_time: 1405302743 -@post_closure_time: 81432919 -@pre_closure_time: 103886118 -``` - -In the above output, overall, the execution spends 1.4s in the main transitive -closure, 103ms before that, and 81ms after that (a total of around 1.5s). - -### Measuring the time spend in lock contended state for Rust `Mutex` (`lock_contend`) -This tools measures the time spent in the lock contended state for Rust `Mutex`s. -The Rust standard library implements `Mutex` using the fast-slow-path paradigm. -Most lock operations take place in inlined fast paths, when there's no contention. -However, when there's contention, -`std::sys::unix::locks::futex_mutex::Mutex::lock_contended` is called. -```rust -#[inline] -pub fn lock(&self) { - if self.futex.compare_exchange(0, 1, Acquire, Relaxed).is_err() { - self.lock_contended(); - } -} - -#[cold] -fn lock_contended(&self) { - -} -``` - - -MMTk uses Rust `Mutex`, e.g., in allocation slow paths for synchronization, -and this tool can be useful to measure the contention in these parts of code. - -The time unit is 256ns. - -Sample output: -``` -@lock_dist[140637228007056]: -[1] 447 |@@@@ | -[2, 4) 3836 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[4, 8) 3505 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[8, 16) 1354 |@@@@@@@@@@@@@@ | -[16, 32) 832 |@@@@@@@@ | -[32, 64) 1077 |@@@@@@@@@@@ | -[64, 128) 2991 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[128, 256) 4846 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[256, 512) 5013 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -[512, 1K) 1203 |@@@@@@@@@@@@ | -[1K, 2K) 34 | | -[2K, 4K) 15 | | -``` +Each sub-directory contains a set of scripts. -In the above output, we can see that the lock instance (140637228007056, or 0x7fe8a8047e90) -roughly has a bimodal distribution in terms of the time spent in lock contended -code path. -The first peak is around 512ns\~1024ns, and the second peak is around 66us\~131us. - -If you can't tell which lock instance is for which lock in MMTk, you can trace -the allocation of the Mutex and record the stack trace (note that you might want -to compile MMTk with `force-frame-pointers` to obtain better stack traces). - -### Measuring the distribution of `process_edges` packet sizes (`packet_size`) -Most of the GC time is spend in the transitive closure for tracing-based GCs, -and MMTk performs transitive closure via work packets that calls the `process_edges` method. -This tool measures the distribution of the sizes of these work packets, and also -count root edges separately. 
-
-Sample output:
-```
-@process_edges_packet_size:
-[1]                  238 |@@@@@                                               |
-[2, 4)               806 |@@@@@@@@@@@@@@@@@                                   |
-[4, 8)              1453 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                     |
-[8, 16)             1105 |@@@@@@@@@@@@@@@@@@@@@@@                             |
-[16, 32)            2410 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
-[32, 64)            1317 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
-[64, 128)           1252 |@@@@@@@@@@@@@@@@@@@@@@@@@@@                         |
-[128, 256)          1131 |@@@@@@@@@@@@@@@@@@@@@@@@                            |
-[256, 512)          2017 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@        |
-[512, 1K)           1270 |@@@@@@@@@@@@@@@@@@@@@@@@@@@                         |
-[1K, 2K)            1028 |@@@@@@@@@@@@@@@@@@@@@@                              |
-[2K, 4K)             874 |@@@@@@@@@@@@@@@@@@                                  |
-[4K, 8K)            1024 |@@@@@@@@@@@@@@@@@@@@@@                              |
-[8K, 16K)             58 |@                                                   |
-[16K, 32K)             5 |                                                    |
-
-@process_edges_root_packet_size:
-[1]                   71 |@@@@@@@                                             |
-[2, 4)                 4 |                                                    |
-[4, 8)               276 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
-[8, 16)              495 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
-[16, 32)             477 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
-[32, 64)             344 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@               |
-[64, 128)            242 |@@@@@@@@@@@@@@@@@@@@@@@@@                           |
-[128, 256)           109 |@@@@@@@@@@@                                         |
-[256, 512)            31 |@@@                                                 |
-[512, 1K)             33 |@@@                                                 |
-[1K, 2K)              75 |@@@@@@@                                             |
-[2K, 4K)              75 |@@@@@@@                                             |
-[4K, 8K)             336 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                 |
-[8K, 16K)             56 |@@@@@                                               |
-[16K, 32K)             3 |                                                    |
-```
-
-In the above output, we can see that overall, the sizes of the `process_edges`
-has a unimodal distribution with a peak around 16\~32 edges per packet.
-However, if we focus on root edges, the distribution is roughly bimodal, with a
-first peak around 8\~16 and a second peak around 4096\~8192.
+
+Each sub-directory contains a set of scripts.
+
+- `performance`: Print various GC-related statistics, such as the distribution of time spent in
+  the allocation slow path, the time spent in each GC stage, and the distribution of
+  `process_edges` packet sizes.
+- `timeline`: Record the start and end time of each GC and each work packet, and visualize them on
+  a timeline in Perfetto UI.
 
 ## Attribution
+
 If used for research, please cite the following publication.
+
 ```bibtex
 @inproceedings{DBLP:conf/pppj/HuangBC23,
   author = {Claire Huang and
diff --git a/tools/tracing/performance/README.md b/tools/tracing/performance/README.md
new file mode 100644
index 0000000000..fb689285a3
--- /dev/null
+++ b/tools/tracing/performance/README.md
@@ -0,0 +1,202 @@
+# MMTk performance tracing
+
+This directory contains tools for printing out various GC-related statistics using eBPF tracing.
+
+## Running
+
+Before running, you should make sure the [bpftrace] command line utility is installed.
+
+[bpftrace]: https://github.com/iovisor/bpftrace
+
+Tracing tools in this directory are to be invoked via the wrapper script `run.py`.
+
+```
+usage: run.py [-h] [-b BPFTRACE] -m MMTK [-H] [-p] [-f {text,json}] tool
+
+positional arguments:
+  tool                  Name of the bpftrace tool
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -b BPFTRACE, --bpftrace BPFTRACE
+                        Path of the bpftrace executable
+  -m MMTK, --mmtk MMTK  Path of the MMTk binary
+  -H, --harness         Only collect data for the timing iteration (harness_begin/harness_end)
+  -p, --print-script    Print the content of the bpftrace script
+  -f {text,json}, --format {text,json}
+                        bpftrace output format
+```
+
+- `-b`: the path to the `bpftrace` executable. By default, it uses the `bpftrace`
+executable in your `PATH`. We strongly recommend you use the latest statically
+compiled `bpftrace` from [upstream](https://github.com/iovisor/bpftrace/releases).
+You need sudo permission to run whichever `bpftrace` you choose.
+- `-m`: the path to an MMTk binary that contains the tracepoints.
+This depends on the binding you use.
+For the OpenJDK binding, it should be `jdk/lib/server/libmmtk_openjdk.so` under
+your build folder.
+To check whether the binary contains tracepoints, you can use `readelf -n`,
+as shown in the example after this list.
+You should see a bunch of `stapsdt` notes with `mmtk` as the provider.
+- `-H`: pass this flag if you want to only measure the timing iteration of a
+benchmark.
+By default, the tracing tools measure the entire execution.
+- `-p`: print the entire tracing script before execution.
+This is mainly for debugging use.
+- `-f`: change the bpftrace output format.
+By default, it uses human-readable plain text output (`text`).
+You can set this to `json` for easy parsing.
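+
+For example, a quick check of a binary might look like this (the path and the exact output shape
+are illustrative):
+
+```
+$ readelf -n jdk/lib/server/libmmtk_openjdk.so | grep -A 2 stapsdt
+  stapsdt              0x0000003c       NT_STAPSDT (SystemTap probe descriptors)
+    Provider: mmtk
+    Name: gc_start
+...
+```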
+
+Please run the tracing tools **before** running the workload.
+If you use `-H`, the tracing tools will automatically end when `harness_end` is
+called.
+Otherwise, you will need to terminate the tools manually with `Ctrl-C`.
+These tools also have a timeout of 1200 seconds so as not to stall unattended
+benchmark execution.
+
+## Tracing tools
+
+### Measuring the time spent in the allocation slow path (`alloc_slow`)
+
+This tool measures the distribution of the allocation slow path time.
+The time unit is 400ns, so that the power-of-two histogram bins are used
+with higher fidelity.
+
+Sample output:
+```
+@alloc_slow_hist:
+[4, 8)               304 |@                                                   |
+[8, 16)            12603 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
+[16, 32)            8040 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                   |
+[32, 64)             941 |@@@                                                 |
+[64, 128)            171 |                                                    |
+[128, 256)            13 |                                                    |
+[256, 512)             2 |                                                    |
+[512, 1K)              0 |                                                    |
+[1K, 2K)               0 |                                                    |
+[2K, 4K)               0 |                                                    |
+[4K, 8K)               0 |                                                    |
+[8K, 16K)              0 |                                                    |
+[16K, 32K)            14 |                                                    |
+[32K, 64K)            37 |                                                    |
+[64K, 128K)           19 |                                                    |
+[128K, 256K)           1 |                                                    |
+```
+
+In the above output, we can see that most allocation slow paths finish between
+3.2us and 6.4us.
+However, there is a long tail, presumably due to GC pauses.
+
+### Measuring the time spent in different GC stages (`gc_stages`)
+
+This tool measures the time spent in different stages of GC: before `Closure`,
+during `Closure`, and after `Closure`.
+The time unit is ns.
+
+Sample output:
+```
+@closure_time: 1405302743
+@post_closure_time: 81432919
+@pre_closure_time: 103886118
+```
+
+In the above output, overall, the execution spends 1.4s in the main transitive
+closure, 103ms before that, and 81ms after that (a total of around 1.5s).
+
+### Measuring the time spent in the lock contended state for Rust `Mutex` (`lock_contended`)
+
+This tool measures the time spent in the lock contended state for Rust `Mutex`es.
+The Rust standard library implements `Mutex` using the fast-slow-path paradigm.
+Most lock operations take place in inlined fast paths when there is no contention.
+However, when there is contention,
+`std::sys::unix::locks::futex_mutex::Mutex::lock_contended` is called.
+
+```rust
+#[inline]
+pub fn lock(&self) {
+    if self.futex.compare_exchange(0, 1, Acquire, Relaxed).is_err() {
+        self.lock_contended();
+    }
+}
+
+#[cold]
+fn lock_contended(&self) {
+    // ... (slow path elided)
+}
+```
+
+MMTk uses Rust `Mutex`es, e.g., in allocation slow paths, for synchronization,
+and this tool can be useful for measuring the contention in these parts of the code.
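+
+As a rough sketch, the core of such a measurement can be expressed in bpftrace as below. This is
+an illustration, not the actual `lock_contended.bt`: the `$MMTK` placeholder and the wildcard
+symbol pattern are assumptions, the symbol must be visible in the binary's symbol table, and we
+assume `&self` arrives in the first argument register.
+
+```
+// Record entry time and the Mutex address (arg0 = &self) when contention begins.
+uprobe:$MMTK:*lock_contended* {
+    @lock_begin[tid] = nsecs;
+    @lock_addr[tid] = arg0;
+}
+
+// On return, histogram the contended time, keyed by the Mutex address.
+// Dividing by 256 makes each histogram unit 256ns.
+uretprobe:$MMTK:*lock_contended* {
+    if (@lock_begin[tid]) {
+        @lock_dist[@lock_addr[tid]] = hist((nsecs - @lock_begin[tid]) / 256);
+        delete(@lock_begin[tid]);
+        delete(@lock_addr[tid]);
+    }
+}
+```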
+
+The time unit is 256ns.
+
+Sample output:
+```
+@lock_dist[140637228007056]:
+[1]                  447 |@@@@                                                |
+[2, 4)              3836 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@            |
+[4, 8)              3505 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                |
+[8, 16)             1354 |@@@@@@@@@@@@@@                                      |
+[16, 32)             832 |@@@@@@@@                                            |
+[32, 64)            1077 |@@@@@@@@@@@                                         |
+[64, 128)           2991 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                     |
+[128, 256)          4846 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
+[256, 512)          5013 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
+[512, 1K)           1203 |@@@@@@@@@@@@                                        |
+[1K, 2K)              34 |                                                    |
+[2K, 4K)              15 |                                                    |
+```
+
+In the above output, we can see that the lock instance (140637228007056, or 0x7fe8a8047e90)
+roughly has a bimodal distribution in the time spent in the lock-contended
+code path.
+The first peak is around 512ns\~1024ns, and the second peak is around 66us\~131us.
+
+If you can't tell which lock instance corresponds to which lock in MMTk, you can trace
+the allocation of the Mutex and record the stack trace (note that you might want
+to compile MMTk with `force-frame-pointers` to obtain better stack traces).
+
+### Measuring the distribution of `process_edges` packet sizes (`packet_size`)
+
+Most of the GC time is spent in the transitive closure for tracing-based GCs,
+and MMTk performs the transitive closure via work packets that call the `process_edges` method.
+This tool measures the distribution of the sizes of these work packets, and also
+counts root edges separately.
+
+Sample output:
+```
+@process_edges_packet_size:
+[1]                  238 |@@@@@                                               |
+[2, 4)               806 |@@@@@@@@@@@@@@@@@                                   |
+[4, 8)              1453 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                     |
+[8, 16)             1105 |@@@@@@@@@@@@@@@@@@@@@@@                             |
+[16, 32)            2410 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
+[32, 64)            1317 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
+[64, 128)           1252 |@@@@@@@@@@@@@@@@@@@@@@@@@@@                         |
+[128, 256)          1131 |@@@@@@@@@@@@@@@@@@@@@@@@                            |
+[256, 512)          2017 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@        |
+[512, 1K)           1270 |@@@@@@@@@@@@@@@@@@@@@@@@@@@                         |
+[1K, 2K)            1028 |@@@@@@@@@@@@@@@@@@@@@@                              |
+[2K, 4K)             874 |@@@@@@@@@@@@@@@@@@                                  |
+[4K, 8K)            1024 |@@@@@@@@@@@@@@@@@@@@@@                              |
+[8K, 16K)             58 |@                                                   |
+[16K, 32K)             5 |                                                    |
+
+@process_edges_root_packet_size:
+[1]                   71 |@@@@@@@                                             |
+[2, 4)                 4 |                                                    |
+[4, 8)               276 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@                        |
+[8, 16)              495 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
+[16, 32)             477 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
+[32, 64)             344 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@               |
+[64, 128)            242 |@@@@@@@@@@@@@@@@@@@@@@@@@                           |
+[128, 256)           109 |@@@@@@@@@@@                                         |
+[256, 512)            31 |@@@                                                 |
+[512, 1K)             33 |@@@                                                 |
+[1K, 2K)              75 |@@@@@@@                                             |
+[2K, 4K)              75 |@@@@@@@                                             |
+[4K, 8K)             336 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                 |
+[8K, 16K)             56 |@@@@@                                               |
+[16K, 32K)             3 |                                                    |
+```
+
+In the above output, we can see that overall, the sizes of the `process_edges`
+packets have a unimodal distribution with a peak around 16\~32 edges per packet.
+However, if we focus on root edges, the distribution is roughly bimodal, with a
+first peak around 8\~16 and a second peak around 4096\~8192.
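+
+As an illustration of how the tool works, the heart of such a measurement can be written against
+the `mmtk:process_edges` tracepoint as below. This is a minimal sketch, not the actual
+`packet_size.bt` (which also handles harness gating and output); `$MMTK` stands for the path of
+the MMTk binary, which `run.py` substitutes into the script.
+
+```
+// arg0 = number of edges in the packet; arg1 = whether these are root edges.
+usdt:$MMTK:mmtk:process_edges {
+    @process_edges_packet_size = hist(arg0);
+    if (arg1) {
+        @process_edges_root_packet_size = hist(arg0);
+    }
+}
+```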
+
diff --git a/tools/tracing/alloc_slow.bt b/tools/tracing/performance/alloc_slow.bt
similarity index 100%
rename from tools/tracing/alloc_slow.bt
rename to tools/tracing/performance/alloc_slow.bt
diff --git a/tools/tracing/epilogue.bt.fragment b/tools/tracing/performance/epilogue.bt.fragment
similarity index 100%
rename from tools/tracing/epilogue.bt.fragment
rename to tools/tracing/performance/epilogue.bt.fragment
diff --git a/tools/tracing/gc_stages.bt b/tools/tracing/performance/gc_stages.bt
similarity index 100%
rename from tools/tracing/gc_stages.bt
rename to tools/tracing/performance/gc_stages.bt
diff --git a/tools/tracing/lock_contended.bt b/tools/tracing/performance/lock_contended.bt
similarity index 100%
rename from tools/tracing/lock_contended.bt
rename to tools/tracing/performance/lock_contended.bt
diff --git a/tools/tracing/packet_size.bt b/tools/tracing/performance/packet_size.bt
similarity index 100%
rename from tools/tracing/packet_size.bt
rename to tools/tracing/performance/packet_size.bt
diff --git a/tools/tracing/prologue_with_harness.bt.fragment b/tools/tracing/performance/prologue_with_harness.bt.fragment
similarity index 100%
rename from tools/tracing/prologue_with_harness.bt.fragment
rename to tools/tracing/performance/prologue_with_harness.bt.fragment
diff --git a/tools/tracing/prologue_without_harness.bt.fragment b/tools/tracing/performance/prologue_without_harness.bt.fragment
similarity index 100%
rename from tools/tracing/prologue_without_harness.bt.fragment
rename to tools/tracing/performance/prologue_without_harness.bt.fragment
diff --git a/tools/tracing/run.py b/tools/tracing/performance/run.py
similarity index 99%
rename from tools/tracing/run.py
rename to tools/tracing/performance/run.py
index c30224e9e2..3f1e4f6a9c 100755
--- a/tools/tracing/run.py
+++ b/tools/tracing/performance/run.py
@@ -51,7 +51,7 @@ def main():
         # the command line, Ctrl-C will be captured by bpftrace instead of the
         # outer Python script. The temporary file can then be cleaned up by
         # the END probe in bpftrace.
-        # 
+        #
         # In theory, you can implement this via pty, but it is very finicky
         # and doesn't work reliably.
         # See also https://github.com/anupli/running-ng/commit/b74e3a13f56dd97f73432d8a391e1d6cd9db8663
diff --git a/tools/tracing/timeline/README.md b/tools/tracing/timeline/README.md
new file mode 100644
index 0000000000..0bdd2a5295
--- /dev/null
+++ b/tools/tracing/timeline/README.md
@@ -0,0 +1,152 @@
+# MMTk GC visualization
+
+This directory contains tools for visualizing the execution time of each work packet on a timeline.
+
+## Before Running
+
+Before running, you should make sure the [bpftrace] command line utility is installed.
+
+[bpftrace]: https://github.com/iovisor/bpftrace
+
+## Capture a log
+
+Run the `./capture.py` script to capture a log.
+
+In this example, we use the OpenJDK binding to run the `lusearch` benchmark in the DaCapo Benchmark
+Suite.
+
+Run the following command as a **normal** user (*not* as `root` or using `sudo`):
+
+```shell
+./capture.py -e 50 -m /path/to/libmmtk_openjdk.so
+```
+
+`-e 50` means we only capture one GC in every 50 GCs, because otherwise the script would print too
+much log output. (Note: printing in bpftrace is done via a fixed-size user/kernel space buffer, so
+excessive printing will overrun the buffer and cause events to be dropped. The `-e` option helps
+reduce the volume of the log, thereby reducing the likelihood of buffer overrun and the time for
+post-processing.
+If one single GC still produces too much log and overruns the buffer, the user should consider
+setting the `BPFTRACE_PERF_RB_PAGES` environment variable. See the man page of `bpftrace`.)
+
+Replace `/path/to/libmmtk_openjdk.so` with the actual path to the `.so` that contains MMTk and its
+binding.
+
+Run the command and it will prompt you for the root password, because the script internally invokes
+`sudo` to run `bpftrace`. If the specified path to the `.so` is correct, it should print something
+like:
+
+```
+Attaching 48 probes...
+```
+
+Then open another terminal, and run OpenJDK with MMTk.
+
+```shell
+/path/to/openjdk/build/linux-x86_64-normal-server-release/images/jdk/bin/java -XX:+UseThirdPartyHeap -Xm{s,x}100M -jar dacapo-23.11-chopin.jar lusearch
+```
+
+You should see logs showing in the terminal that runs `./capture.py`, like this:
+
+```
+...
+WORK,E,23151,19072307551777,140188748267562
+WORK,B,23162,19072307551916,140188748272882,143
+WORK,E,23160,19072307551959,140188748279315
+WORK,B,23150,19072307552199,140188748259188,42
+WORK,B,23151,19072307552801,140188748259188,42
+WORK,E,23150,19072307553295,140188748259188
+WORK,B,23160,19072307553315,140188748259188,42
+WORK,E,23151,19072307553701,140188748259188
+WORK,E,23160,19072307554493,140188748259188
+WORK,E,23156,19072307554636,140188748272882
+WORK,B,23151,19072307554917,140188748272882,143
+WORK,B,23150,19072307555142,140188748327402,406
+WORK,E,23162,19072307555309,140188748272882
+Lost 2027780 events
+```
+
+Then press CTRL-C in the terminal that runs `./capture.py`. It should print additional logs and
+then exit, like this:
+
+```
+...
+@type_name[140188748468414]: mmtk::util::finalizable_processor::Finalization
+...
+```
+
+To post-process the log, capture it into a file by redirecting the standard output of
+`./capture.py`, e.g. `./capture.py -e 50 -m /path/to/libmmtk_openjdk.so > mybenchmark.log`.
+
+Type the root password if prompted.
+
+Then run OpenJDK again. This time, `./capture.py` should not print anything on the console. When
+the benchmark finishes, press CTRL-C to quit `./capture.py`. You should see the log content in the
+log file `mybenchmark.log`.
+
+### `harness_begin` and `harness_end`
+
+If your test harness calls `memory_manager::harness_begin` and `memory_manager::harness_end` before
+and after the main part of the benchmark, you can add the command line option `-H` to `./capture.py`
+so that it only records work packets between those two function calls, and it will automatically
+exit once `harness_end` is called (i.e., you don't need to manually press CTRL-C to quit
+`./capture.py`).
+
+For the OpenJDK binding, this means you need to build the probes and specify the callbacks properly
+according to your benchmark suite. For example,
+
+```shell
+/path/to/openjdk/build/linux-x86_64-normal-server-release/images/jdk/bin/java \
+    -XX:+UseThirdPartyHeap \
+    -Xm{s,x}100M \
+    -Djava.library.path=/path/to/probes/out \
+    -Dprobes=RustMMTk \
+    -cp /path/to/probes/out/probes.jar:/path/to/dacapo-23.11-chopin.jar \
+    Harness -c probe.DacapoChopinCallback lusearch
+```
+
+## Post-processing the log for visualization
+
+Then run `./visualize.py` on the captured log.
+
+```shell
+./visualize.py mybenchmark.log
+```
+
+It will produce a file named `mybenchmark.log.json.gz`.
+
+Then open a browser and visit the Perfetto UI (https://ui.perfetto.dev/), click "Open trace file"
+on the left, and choose the `mybenchmark.log.json.gz` file just produced. It will process the log
+in your browser and show a timeline. Zoom in to one GC, and you should see the timeline for the
+GC, like this:
+
+![Perfetto UI timeline](./perfetto-example.png)
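+
+For reference, `visualize.py` emits the log as Chrome trace events, which is the JSON format
+Perfetto UI consumes: each `B` (begin) event is matched with an `E` (end) event on the same
+thread track, GC start/end events are placed on a virtual thread with `tid` 0, and work packet
+events are renamed to their Rust type names. A trimmed, hand-written example of the generated
+JSON (all values are illustrative):
+
+```json
+{"traceEvents": [
+  {"name": "GC", "ph": "B", "tid": 0, "ts": 0.0},
+  {"name": "mmtk::scheduler::gc_work::ScanMutatorRoots", "ph": "B", "tid": "23151",
+   "ts": 139.2, "args": {"type_id": 140188748259188}},
+  {"name": "mmtk::scheduler::gc_work::ScanMutatorRoots", "ph": "E", "tid": "23151",
+   "ts": 494.0, "args": {"type_id": 140188748259188}},
+  {"name": "GC", "ph": "E", "tid": 0, "ts": 5150.7}
+]}
+```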
+
+## Known issues
+
+### "(unknown:xxxx)" work packet names
+
+When `bpftrace` reads the work packet names at the `work` USDT trace points, it sometimes sees the
+string contents as all `'\0'`. It is likely a result of lazy mmap. The packet name is obtained by
+`std::any::type_name`, which is currently implemented using debug information. It is likely that
+the string contents are not mmapped at the time when `bpftrace` reads them from outside the
+process.
+
+The `visualize.py` script uses the place-holder `(unknown:xxxx)` for such strings, where `xxxx` is
+the address of the string.
+
+**Enable the `bpftrace_workaround` feature** of `mmtk-core` (e.g. by passing
+`--features bpftrace_workaround` to `cargo build` when building mmtk-core directly) to work around
+this problem. It forces a load from the packet name before the trace point to ensure the string is
+mapped. It adds a tiny overhead, so it is not enabled by default.
+
+See: https://github.com/mmtk/mmtk-core/issues/1020
+
diff --git a/tools/tracing/timeline/capture.bt b/tools/tracing/timeline/capture.bt
new file mode 100644
index 0000000000..fa6f01d4a2
--- /dev/null
+++ b/tools/tracing/timeline/capture.bt
@@ -0,0 +1,82 @@
+BEGIN {
+    @every = $EVERY;
+    @harness = $HARNESS;
+
+    @gc_count = 0;
+
+    if (!@harness) {
+        // Always collect data when not gating on the harness.
+        @stats_enabled = 1;
+    }
+}
+
+END {
+    system("rm $TMP_FILE");
+}
+
+usdt:$MMTK:mmtk:harness_begin {
+    if (@harness) {
+        // Begin collecting data at harness_begin (start of the final iteration).
+        @stats_enabled = 1;
+    }
+}
+
+usdt:$MMTK:mmtk:harness_end {
+    if (@harness) {
+        // Stop collecting data at harness_end (end of the final iteration).
+        @stats_enabled = 0;
+        exit();
+    }
+}
+
+usdt:$MMTK:mmtk:gc_start {
+    printf("GC,B,%d,%lu\n", tid, nsecs);
+    @gc_count += 1;
+    // bpftrace warns that the signed `%` operator may have undefined behavior.
+    if ((uint64)@gc_count % (uint64)@every == 0 && @stats_enabled) {
+        @enable_print = 1;
+    } else {
+        @enable_print = 0;
+    }
+}
+
+usdt:$MMTK:mmtk:gc_end {
+    printf("GC,E,%d,%lu\n", tid, nsecs);
+}
+
+usdt:$MMTK:mmtk:bucket_opened {
+    if (@enable_print) {
+        printf("BUCKET_OPEN,i,%d,%lu,%lu\n", tid, nsecs, arg0);
+    }
+}
+
+usdt:$MMTK:mmtk:work_poll {
+    if (@thread_in_work[tid]) {
+        if (@enable_print) {
+            printf("WORK,E,%d,%lu,%lu\n", tid, nsecs, @thread_work_id[tid]);
+        }
+    }
+}
+
+usdt:$MMTK:mmtk:work {
+    @thread_in_work[tid] = 1;
+    if (!@decoded[arg0]) {
+        // Note: The address of the work packet name is correct, but sometimes bpftrace sees the
+        // characters in the packet name string as all '\0'.
+        // See the "Known issues" section of README.md
+        @type_name[arg0] = str(arg0, arg1);
+        @decoded[arg0] = 1;
+    }
+    @thread_work_id[tid] = arg0;
+    if (@enable_print) {
+        printf("WORK,B,%d,%lu,%lu,%lu\n", tid, nsecs, arg0, arg1);
+    }
+}
+
+usdt:$MMTK:mmtk:process_edges {
+    if (@enable_print) {
+        printf("process_edges,meta,%d,%lu,%lu,%lu\n", tid, nsecs, arg0, arg1);
+    }
+}
+
+// vim: ft=bpftrace ts=4 sw=4 sts=4 et
diff --git a/tools/tracing/timeline/capture.py b/tools/tracing/timeline/capture.py
new file mode 100755
index 0000000000..72ac712430
--- /dev/null
+++ b/tools/tracing/timeline/capture.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+from argparse import ArgumentParser
+from pathlib import Path
+from string import Template
+import os
+import sys
+import tempfile
+
+
+def get_args():
+    parser = ArgumentParser(
+        description="""
+This script is the first part of GC visualization. It captures a trace,
+recording the start and end of every GC and every work packet.
+""", + epilog=""" +This script should be invoked as a normal user, but it will ask the user for +root password because it will use `sudo` to run `bpftrace`. The user should +redirect the standard output to a log file so that the log file can be post- +processed by the `./visualize.py` script. +""") + + parser.add_argument("-b", "--bpftrace", type=str, default="bpftrace", + help="Path of the bpftrace executable") + parser.add_argument("-m", "--mmtk", type=str, required=True, + help="Path of the MMTk binary") + parser.add_argument("-H", "--harness", action="store_true", + help="Only collect data for the timing iteration (harness_begin/harness_end)") + parser.add_argument("-p", "--print-script", action="store_true", + help="Print the content of the bpftrace script") + parser.add_argument("-e", "--every", metavar="N", type=int, default=1, + help="Only capture every N-th GC"), + return parser.parse_args() + + +def main(): + args = get_args() + here = Path(__file__).parent.resolve() + bpftrace_script = here / "capture.bt" + mmtk_bin = Path(args.mmtk) + + if not mmtk_bin.exists(): + raise f"MMTk binary {str(mmtk_bin)} not found." + + template = Template(bpftrace_script.read_text()) + with tempfile.NamedTemporaryFile(mode="w+t") as tmp: + content = template.safe_substitute( + EVERY=args.every, + HARNESS=int(args.harness), + MMTK=mmtk_bin, + TMP_FILE=tmp.name) + if args.print_script: + print(content) + tmp.write(content) + tmp.flush() + # We use execvp to replace the current process instead of creating + # a subprocess (or sh -c). This is so that when users invoke this from + # the command line, Ctrl-C will be captured by bpftrace instead of the + # outer Python script. The temporary file can then be cleaned up by + # the END probe in bpftrace. + # + # In theory, you can implement this via pty, but it is very finicky + # and doesn't work reliably. + # See also https://github.com/anupli/running-ng/commit/b74e3a13f56dd97f73432d8a391e1d6cd9db8663 + os.execvp("sudo", ["sudo", args.bpftrace, + "--unsafe", tmp.name]) + + +if __name__ == "__main__": + main() diff --git a/tools/tracing/timeline/perfetto-example.png b/tools/tracing/timeline/perfetto-example.png new file mode 100644 index 0000000000..a0b92df241 Binary files /dev/null and b/tools/tracing/timeline/perfetto-example.png differ diff --git a/tools/tracing/timeline/visualize.py b/tools/tracing/timeline/visualize.py new file mode 100755 index 0000000000..6395938c53 --- /dev/null +++ b/tools/tracing/timeline/visualize.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +import argparse +import gzip +import json +import re +import sys + +RE_TYPE_ID = re.compile(r"\d+") +UNKNOWN_TYPE = "(unknown)" + +def get_args(): + parser = argparse.ArgumentParser( + description=""" +This script is the second part of GC visualization. It takes the output from +`./capture.py` as input, and format it into a JSON file suitable to be consumed +by Perfetto UI. 
+""") + parser.add_argument("input", type=str, help="Input file"), + return parser.parse_args() + +class LogProcessor: + def __init__(self): + self.type_id_name = {} + self.results = [] + self.start_time = None + + def process_line(self, line): + if line.startswith("@type_name"): + self.process_type_line(line) + elif "," in line: + self.process_log_line(line) + + def process_type_line(self, line): + left, right = line.split(":", 1) + type_id = int(RE_TYPE_ID.search(left).group()) + type_name = right.strip() + if type_name == "": + # bpftrace sometimes sees empty strings when using the `str` function + # See the "Known issues" section in README.md + type_name = UNKNOWN_TYPE + self.type_id_name[type_id] = type_name + + def process_log_line(self, line): + parts = line.split(",") + try: + name, be, tid, ts = parts[:4] + except: + print("Abnormal line: {}".format(line)) + raise + ts = int(ts) + rest = parts[4:] + + if not self.start_time: + self.start_time = ts + + result = { + "name": name, + "ph": be, + "tid": tid, + # https://github.com/google/perfetto/issues/274 + "ts": (ts - self.start_time) / 1000.0 + } + + match name: + case "GC": + # Put GC start/stop events in a virtual thread with tid=0 + result["tid"] = 0 + + case "BUCKET_OPEN": + result["args"] = { + "stage": int(rest[0]) + } + + case "INST": + result["args"] = { + "val": int(rest[0]) + } + + case "WORK": + result["args"] = { + "type_id": int(rest[0]) + } + + self.results.append(result) + + def resolve_results(self): + for result in self.results: + if result["name"] == "WORK": + type_id = result["args"]["type_id"] + type_name = self.type_id_name[type_id] + if type_name == UNKNOWN_TYPE: + type_name = f"(unknown:{type_id})" + result["name"] = type_name + + def output(self, outfile): + json.dump({ + "traceEvents": self.results, + }, outfile) + + +def main(): + args = get_args() + + log_processor = LogProcessor() + + print("Parsing lines...") + with open(args.input) as f: + start_time = None + + for line in f.readlines(): + line = line.strip() + + log_processor.process_line(line) + + output_name = args.input + ".json.gz" + + print("Resolving work packet type names...") + log_processor.resolve_results() + + print(f"Dumping JSON output to {output_name}") + with gzip.open(output_name, "wt") as f: + log_processor.output(f) + +if __name__ == '__main__': + main()