risingwavelabs · kwannoel · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 13, 2024
diff --git a/src/stream/benches/stream_hash_join.py b/src/stream/benches/stream_hash_join.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
-#!/usr/bin/env python3
+# Copyright 2024 RisingWave Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
-#!/usr/bin/env python3
+# Copyright 2024 RisingWave Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+import json
+# Executes the full benchmark for stream_hash_join runtime and memory consumption.
+# Prints the results to stdout as CSV (one header row, then one row per configuration).
+
+import subprocess
+import re
+import sys
+
+# CSV header row; one data row per benchmark configuration is appended below and
+# everything is printed to stdout at the end.
+# NOTE: the "Total Blocks" / "Total Bytes" columns carry the `max_blocks` / `max_bytes`
+# values printed by the stream_hash_join_mem benchmark.
+results = ["Amp,Workload,JoinType,Total Blocks,Total Bytes,Runtime (ns)"]
+
+# Run benchmarks and capture results
+for amp in [20_000, 40_000, 200_000, 400_000]:
+    for workload in ["NotInCache", "InCache"]:
+        for join_type in ["Inner", "LeftOuter"]:
+            # Construct the commands: the memory benchmark reads its parameters from the
+            # ARGS environment variable, the runtime benchmark is selected by a name filter.
+            cmd_mem = f'ARGS={amp},{workload},{join_type} cargo bench --features dhat-heap --bench stream_hash_join_mem'
+            cmd_rt = f'cargo criterion --message-format json --bench stream_hash_join_rt -- hash_join_rt_{amp}_{workload}_{join_type}'
+
+            # Measurements that cannot be extracted stay empty, so that every row keeps
+            # all six columns and stays aligned with the header.
+            total_blocks = ""
+            total_bytes = ""
+            time_ns = ""
+
+            try:
+                # Run cmd_mem and capture output (stderr is merged into stdout)
+                output = subprocess.check_output(cmd_mem, shell=True, stderr=subprocess.STDOUT, universal_newlines=True)
+
+                # Extract total blocks and bytes
+                total_blocks_match = re.search(r'max_blocks:\s*(\d+)', output)
+                total_bytes_match = re.search(r'max_bytes:\s*(\d+)', output)
+
+                if total_blocks_match and total_bytes_match:
+                    total_blocks = total_blocks_match.group(1)
+                    total_bytes = total_bytes_match.group(1)
+                else:
+                    print(f"No total_blocks or total_bytes found for: Amp={amp}, Workload={workload}, JoinType={join_type}", file=sys.stderr)
+
+                # Run cmd_rt and capture output; only the first line of output is parsed
+                json_output = subprocess.check_output(cmd_rt, shell=True, universal_newlines=True)
+                json_output = json_output.split('\n')
+                try:
+                    time_ns = json.loads(json_output[0])["typical"]["estimate"]
+                except Exception as e:
+                    # Diagnostics go to stderr so that stdout only ever carries CSV.
+                    print(f"could not parse {json_output[0]} due to {e}", file=sys.stderr)
+                    sys.exit(1)
+                if not time_ns:
+                    print(f"No runtime found for: Amp={amp}, Workload={workload}, JoinType={join_type}", file=sys.stderr)
+                    time_ns = ""
+
+                results.append(f"{amp},{workload},{join_type},{total_blocks},{total_bytes},{time_ns}")
+
+            except subprocess.CalledProcessError as e:
+                # A failing benchmark command is reported and its row is skipped.
+                print(f"Error running benchmark for Amp={amp}, Workload={workload}, JoinType={join_type}", file=sys.stderr)
+                print(f"Error output: {e.output}", file=sys.stderr)
+
+for result in results:
+    print(result)
diff --git a/src/stream/benches/stream_hash_join_mem.rs b/src/stream/benches/stream_hash_join_mem.rs
@@ -14,17 +14,13 @@
 
 #![feature(let_chains)]
 
-//! To run this benchmark you can use the following command:
+//! Specify `amplification_size,workload,join_type` via the `ARGS` environment variable, e.g.
 //! ```sh
-//! cargo bench --features dhat-heap --bench stream_hash_join
-//! ```
-//!
-//! You may also specify the amplification size, e.g. 40000
-//! ```sh
-//! cargo bench --features dhat-heap --bench stream_hash_join -- 40000
+//! ARGS=40000,NotInCache,Inner cargo bench --features dhat-heap --bench stream_hash_join_mem
 //! ```
 
 use std::env;
+use risingwave_pb::plan_common::JoinType;
 
 use risingwave_stream::executor::test_utils::hash_join_executor::*;
 
@@ -36,20 +32,51 @@ static ALLOC: dhat::Alloc = dhat::Alloc;
 
 #[tokio::main]
 async fn main() {
-    let args: Vec<_> = env::args().collect();
-    let amp = if let Some(raw_arg) = args.get(1)
-        && let Ok(arg) = raw_arg.parse()
+    // Benchmark parameters come from the `ARGS` environment variable, formatted as
+    // `amplification_size,workload,join_type`, e.g. `ARGS=40000,NotInCache,Inner`.
+    let arg = env::var("ARGS");
+    let (amp, workload, join_type) = if let Ok(raw_arg) = arg
     {
-        arg
+        let parts = raw_arg.split(',').collect::<Vec<_>>();
+        // Fail with a readable message instead of an index-out-of-bounds panic.
+        assert!(
+            parts.len() >= 3,
+            "invalid ARGS: {:?}, expected amplification_size,workload,join_type",
+            raw_arg
+        );
+        // Build the panic message only when parsing actually fails.
+        let amp = parts[0]
+            .parse::<usize>()
+            .unwrap_or_else(|e| panic!("invalid amplification_size: {}: {}", parts[0], e));
+        let workload = match parts[1] {
+            "NotInCache" => HashJoinWorkload::NotInCache,
+            "InCache" => HashJoinWorkload::InCache,
+            _ => panic!("Invalid workload: {}", parts[1]),
+        };
+        let join_type = match parts[2] {
+            "Inner" => JoinType::Inner,
+            "LeftOuter" => JoinType::LeftOuter,
+            _ => panic!("Invalid join type: {}", parts[2]),
+        };
+        (amp, workload, join_type)
     } else {
-        100_000
+        panic!("invalid ARGS: {:?}", arg);
     };
-    let (tx_l, tx_r, out) = setup_bench_stream_hash_join(amp).await;
+
+    let (tx_l, tx_r, out) = setup_bench_stream_hash_join(amp, workload, join_type).await;
     {
         // Start the profiler later, after we have ingested the data for hash join build-side.
         #[cfg(feature = "dhat-heap")]
         let _profiler = dhat::Profiler::new_heap();
 
-        handle_streams(amp, tx_l, tx_r, out).await;
+        handle_streams(workload, join_type, amp, tx_l, tx_r, out).await;
+        // `HeapStats::get()` panics when no heap profiler is running, so the stats are
+        // only reported when the profiler above was actually created.
+        #[cfg(feature = "dhat-heap")]
+        {
+            let stats = dhat::HeapStats::get();
+            println!("max_blocks: {}", stats.max_blocks);
+            println!("max_bytes: {}", stats.max_bytes);
+        }
     }
 }
diff --git a/src/stream/benches/stream_hash_join_rt.rs b/src/stream/benches/stream_hash_join_rt.rs
@@ -23,6 +23,7 @@ use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
 use futures::executor::block_on;
 use risingwave_stream::executor::test_utils::hash_join_executor::*;
 use tokio::runtime::Runtime;
+use risingwave_pb::plan_common::JoinType;
 
 risingwave_expr_impl::enable!();
 
@@ -32,14 +33,18 @@ fn bench_hash_join(c: &mut Criterion) {
 
     let rt = Runtime::new().unwrap();
     for amp in [10_000, 20_000, 30_000, 40_000, 100_000, 200_000, 400_000] {
-        let name = format!("hash_join_rt_{}", amp);
-        group.bench_function(&name, |b| {
-            b.to_async(&rt).iter_batched(
-                || block_on(setup_bench_stream_hash_join(amp)),
-                |(tx_l, tx_r, out)| handle_streams(amp, tx_l, tx_r, out),
-                BatchSize::SmallInput,
-            )
-        });
+        for workload in [HashJoinWorkload::NotInCache, HashJoinWorkload::InCache] {
+            for join_type in [JoinType::Inner, JoinType::LeftOuter] {
+                let name = format!("hash_join_rt_{}_{}_{:#?}", amp, workload, join_type);
+                group.bench_function(&name, |b| {
+                    b.to_async(&rt).iter_batched(
+                        || block_on(setup_bench_stream_hash_join(amp, workload, join_type)),
+                        |(tx_l, tx_r, out)| handle_streams(workload, join_type, amp, tx_l, tx_r, out),
+                        BatchSize::SmallInput,
+                    )
+                });
+            }
+        }
     }
 }