feat: add basic profiling to benchmarks (#116)

# Problem The benchmarks currently return the number of instructions it took to execute each benchmark. While this number is useful to measure performance, it doesn't provide insight into where these instructions are being used and where the performance bottle necks are. Without this information, making informed performance optimizations would require a lot of trial and error. # Solution The typical solution to this problem is to use some kind of profiler. `ic-repl` already supports profiling and can output a flamegraph of where instructions are being spent, but it has a few drawbacks that makes it difficult to use: 1. The names of rust methods are mangled, even when `debug = 1` is turned on, making it hard to make sense of the output. 2. Each benchmark includes logic to first setup, and only after setup would we want to profile, so we'd need a way to programmatically tell the profiler to reset its measurements. 3. Often we'd like to benchmark blocks of code that aren't functions. To address the issues above, this commit introduces a "poor man profiler". This profiler is manual, in the sense that the developer adds to the code hints for what they care about profiling. In this PR, I added some basic hints, and the benchmarks now return an output that looks like this: ``` Benchmarking btreemap_insert_blob_64_1024_v2: Warming up for 1.0000 ms 2023-08-23 07:26:53.560585 UTC: [Canister rwlgt-iiaaa-aaaaa-aaaaa-cai] { "node_load_v2": "5_182_358_668 (80%)", "node_save_v2": "786_197_957 (12%)", } Benchmarking btreemap_insert_blob_64_1024_v2: Collecting 10 samples in estimated 345.63 s (165 iterations btreemap_insert_blob_64_1024_v2 time: [6474.1 M Instructions 6474.1 M Instructions 6474.1 M Instructions] change: [+0.0000% +0.0000% +0.0000%] (p = NaN > 0.05) No change in performance detected. ``` This approach is simple and effective, but it does have the draw back that it makes the instructions count slightly inaccurate, as the profiling logic itself consumes cycles. I think we can limit this inaccuracy by making the `profiler` crate internally account for its own overhead and deducting those from its measurements.
dfinity · Sep 5, 2023 · 6d00eca · 6d00eca
1 parent 4a454eb
commit 6d00eca
Show file tree

Hide file tree

Showing 11 changed files with 137 additions and 6 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -11,6 +11,10 @@ keywords = ["internet-computer", "dfinity", "stable-structures"]
 include = ["src", "Cargo.toml", "LICENSE", "README.md"]
 repository = "https://github.com/dfinity/stable-structures"
 
+[dependencies]
+# An optional dependency to profile parts of the code.
+profiler = { path = "./profiler", optional = true }
+
 [dev-dependencies]
 criterion = "0.4.0"
 ic-cdk = "0.6.8"

diff --git a/benches/benches.rs b/benches/benches.rs
@@ -144,6 +144,10 @@ fn execution_instructions(arguments: ExecutionArguments) -> u64 {
     let stderr = String::from_utf8(output.stderr).unwrap();
     assert!(output.status.success(), "{stdout}\n{stderr}");
 
+    // Output logs from the canister as it contains profiling data.
+    println!();
+    println!("{stderr}");
+
     // Convert result formatted as "(1_000_000 : nat64)" to u64.
     let result = stdout
         .trim()

diff --git a/benchmark-canisters/Cargo.toml b/benchmark-canisters/Cargo.toml
@@ -8,5 +8,6 @@ edition = "2021"
 [dependencies]
 ic-cdk = "0.6.8"
 ic-cdk-macros = "0.6.8"
-ic-stable-structures = { path = "../" }
+ic-stable-structures = { path = "../", features = ["profiler"] }
+profiler = { path = "../profiler" }
 tiny-rng = "0.2.0"
diff --git a/benchmark-canisters/src/main.rs b/benchmark-canisters/src/main.rs
@@ -8,8 +8,32 @@ mod vec;
 /// Returns the number of instructions consumed by the given function.
 pub(crate) fn count_instructions<R>(f: impl FnOnce() -> R) -> u64 {
     let start = ic_cdk::api::performance_counter(0);
+    profiler::reset();
     f();
-    ic_cdk::api::performance_counter(0) - start
+    let total_instructions = ic_cdk::api::performance_counter(0) - start;
+
+    let profiling_results: std::collections::BTreeMap<_, _> = profiler::get_results()
+        .into_iter()
+        .map(|(k, v)| {
+            (
+                k,
+                format!("{} ({}%)", format_num(v), ((v * 100) / total_instructions)),
+            )
+        })
+        .collect();
+    ic_cdk::api::print(&format!("{:#?}", profiling_results));
+    total_instructions
+}
+
+fn format_num(num: u64) -> String {
+    num.to_string()
+        .as_bytes()
+        .rchunks(3)
+        .rev()
+        .map(std::str::from_utf8)
+        .collect::<Result<Vec<&str>, _>>()
+        .unwrap()
+        .join("_")
 }
 
 const fn max_size<A: Storable>() -> u32 {

diff --git a/profiler/Cargo.lock b/profiler/Cargo.lock
diff --git a/profiler/Cargo.toml b/profiler/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "profiler"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ic-cdk = "0.6.8"
diff --git a/profiler/src/lib.rs b/profiler/src/lib.rs
@@ -0,0 +1,64 @@
+//! A module for profiling canisters.
+use std::cell::RefCell;
+use std::collections::BTreeMap;
+
+thread_local! {
+    static PROFILING: RefCell<BTreeMap<&'static str, u64>> = RefCell::new(BTreeMap::new());
+}
+
+/// Starts profiling the instructions consumed.
+///
+/// Instructions are counted and recorded under the given name until the
+/// `Profile` object returned is dropped.
+pub fn profile(name: &'static str) -> Profile {
+    Profile::new(name)
+}
+
+/// Clears all profiling data.
+pub fn reset() {
+    PROFILING.with(|p| p.borrow_mut().clear());
+}
+
+/// Returns the number of instructions used for each of the profile names.
+pub fn get_results() -> std::collections::BTreeMap<&'static str, u64> {
+    PROFILING.with(|p| p.borrow().clone())
+}
+
+pub struct Profile {
+    name: &'static str,
+    start_instructions: u64,
+}
+
+impl Profile {
+    fn new(name: &'static str) -> Self {
+        Self {
+            name,
+            start_instructions: instruction_count(),
+        }
+    }
+}
+
+impl Drop for Profile {
+    fn drop(&mut self) {
+        let instructions_count = instruction_count() - self.start_instructions;
+
+        PROFILING.with(|p| {
+            let mut p = p.borrow_mut();
+            let entry = p.entry(&self.name).or_insert(0);
+            *entry += instructions_count;
+        });
+    }
+}
+
+fn instruction_count() -> u64 {
+    #[cfg(target_arch = "wasm32")]
+    {
+        ic_cdk::api::performance_counter(0)
+    }
+
+    #[cfg(not(target_arch = "wasm32"))]
+    {
+        // Consider using cpu time here.
+        0
+    }
+}
diff --git a/src/btreemap.rs b/src/btreemap.rs
@@ -1309,8 +1309,6 @@ mod test {
 
             // Overwrite the value of the median key.
             assert_eq!(btree.insert(b(&[12]), b(&[1, 2, 3])), Some(b(&[])));
-
-            // The value is overwritten successfully.
             assert_eq!(btree.get(&b(&[12])), Some(b(&[1, 2, 3])));
 
             // The child has not been split and is still full.
@@ -2157,7 +2155,6 @@ mod test {
                 vec![(b(&[1, 4]), vec![]), (b(&[2, 3]), vec![])]
             );
             assert_eq!(root.children_len(), 3);
-
             let child_0 = btree.load_node(root.child(0));
             assert_eq!(child_0.node_type(), NodeType::Leaf);
             assert_eq!(

diff --git a/src/btreemap/node/v1.rs b/src/btreemap/node/v1.rs
@@ -59,6 +59,9 @@ impl<K: Storable + Ord + Clone> Node<K> {
         max_value_size: u32,
         memory: &M,
     ) -> Self {
+        #[cfg(feature = "profiler")]
+        let _p = profiler::profile("node_load_v1");
+
         // Load the header.
         let header: NodeHeader = read_struct(address, memory);
         assert_eq!(&header.magic, MAGIC, "Bad magic.");
@@ -118,6 +121,9 @@ impl<K: Storable + Ord + Clone> Node<K> {
     }
 
     pub(super) fn save_v1<M: Memory>(&self, memory: &M) {
+        #[cfg(feature = "profiler")]
+        let _p = profiler::profile("node_save_v1");
+
         match self.node_type {
             NodeType::Leaf => {
                 assert!(self.children.is_empty());

diff --git a/src/btreemap/node/v2.rs b/src/btreemap/node/v2.rs
@@ -112,6 +112,9 @@ impl<K: Storable + Ord + Clone> Node<K> {
 
     /// Loads a v2 node from memory at the given address.
     pub(super) fn load_v2<M: Memory>(address: Address, page_size: PageSize, memory: &M) -> Self {
+        #[cfg(feature = "profiler")]
+        let _p = profiler::profile("node_load_v2");
+
         // Load the node, including any overflows, into a buffer.
         let node_buf = read_node(address, page_size.get(), memory);
 
@@ -192,6 +195,9 @@ impl<K: Storable + Ord + Clone> Node<K> {
 
     // Saves the node to memory.
     pub(super) fn save_v2<M: Memory>(&self, allocator: &mut Allocator<M>) {
+        #[cfg(feature = "profiler")]
+        let _p = profiler::profile("node_save_v2");
+
         let page_size = self.version.page_size().get();
         assert!(page_size >= MINIMUM_PAGE_SIZE);
         assert_eq!(self.keys.len(), self.encoded_values.borrow().len());
@@ -255,7 +261,7 @@ impl<K: Storable + Ord + Clone> Node<K> {
     ) {
         // Compute how many overflow pages are needed.
         let additional_pages_needed = if buf.len() > page_size {
-            debug_assert!(page_size >= PAGE_OVERFLOW_DATA_OFFSET.into());
+            debug_assert!(page_size >= PAGE_OVERFLOW_DATA_OFFSET.get() as usize);
             let overflow_page_capacity = page_size - PAGE_OVERFLOW_DATA_OFFSET.get() as usize;
             let overflow_data_len = buf.len() - page_size;