From d80b65c135a61ba481d5bef72551f6ac500ee548 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 26 Sep 2024 18:14:13 +0800 Subject: [PATCH 1/5] implement the benchmark framework --- wren-modeling-rs/Cargo.toml | 2 +- wren-modeling-rs/benchmarks/Cargo.toml | 25 ++ wren-modeling-rs/benchmarks/bench.sh | 214 ++++++++++++++++++ wren-modeling-rs/benchmarks/src/bin/tpch.rs | 17 ++ wren-modeling-rs/benchmarks/src/lib.rs | 2 + wren-modeling-rs/benchmarks/src/tpch/mod.rs | 154 +++++++++++++ wren-modeling-rs/benchmarks/src/tpch/run.rs | 83 +++++++ wren-modeling-rs/benchmarks/src/util/mod.rs | 2 + .../benchmarks/src/util/options.rs | 10 + wren-modeling-rs/benchmarks/src/util/run.rs | 130 +++++++++++ 10 files changed, 638 insertions(+), 1 deletion(-) create mode 100644 wren-modeling-rs/benchmarks/Cargo.toml create mode 100755 wren-modeling-rs/benchmarks/bench.sh create mode 100644 wren-modeling-rs/benchmarks/src/bin/tpch.rs create mode 100644 wren-modeling-rs/benchmarks/src/lib.rs create mode 100644 wren-modeling-rs/benchmarks/src/tpch/mod.rs create mode 100644 wren-modeling-rs/benchmarks/src/tpch/run.rs create mode 100644 wren-modeling-rs/benchmarks/src/util/mod.rs create mode 100644 wren-modeling-rs/benchmarks/src/util/options.rs create mode 100644 wren-modeling-rs/benchmarks/src/util/run.rs diff --git a/wren-modeling-rs/Cargo.toml b/wren-modeling-rs/Cargo.toml index f0eb4c658..c65f0e097 100644 --- a/wren-modeling-rs/Cargo.toml +++ b/wren-modeling-rs/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["core", "sqllogictest", "wren-example"] +members = ["benchmarks", "core", "sqllogictest", "wren-example"] resolver = "2" [workspace.package] diff --git a/wren-modeling-rs/benchmarks/Cargo.toml b/wren-modeling-rs/benchmarks/Cargo.toml new file mode 100644 index 000000000..3bbae3cf9 --- /dev/null +++ b/wren-modeling-rs/benchmarks/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "wren-benchmarks" +authors.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[lib] +name = "wren_benchmarks" +path = "src/lib.rs" + +[dependencies] +wren-core = { workspace = true } +datafusion = { workspace = true } +num_cpus = "1.16.0" +structopt = {version = "0.3.26", default-features = false} +tokio = {workspace = true } +env_logger = {workspace = true } +serde = {workspace = true } +serde_json = {workspace = true} +log = "0.4.21" \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/bench.sh b/wren-modeling-rs/benchmarks/bench.sh new file mode 100755 index 000000000..f3462f2d0 --- /dev/null +++ b/wren-modeling-rs/benchmarks/bench.sh @@ -0,0 +1,214 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is meant for developers of DataFusion -- it is runnable +# from the standard DataFusion development environment and uses cargo, +# etc and orchestrates gathering data and run the benchmark binary in +# different configurations. + + +# Exit on error +set -e + +# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + + +# Set Defaults +COMMAND= +BENCHMARK=all +WREN_DIR=${WREN_DIR:-$SCRIPT_DIR/..} +CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"} +VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv} + +usage() { + echo " +Orchestrates running benchmarks against DataFusion checkouts + +Usage: +$0 run [benchmark] +$0 compare +$0 venv + +********** +Examples: +********** +# Run the 'tpch' benchmark on the datafusion checkout in /source/datafusion +WREN_DIR=/source/datafusion ./bench.sh run tpch + +********** +* Commands +********** +run: Runs the named benchmark +compare: Compares results from benchmark runs +venv: Creates new venv (unless already exists) and installs compare's requirements into it + +********** +* Benchmarks +********** +all(default): Data/Run/Compare for all benchmarks +tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join + +********** +* Supported Configuration (Environment Variables) +********** +CARGO_COMMAND command that runs the benchmark binary +WREN_DIR directory to use (default $WREN_DIR) +RESULTS_NAME folder where the benchmark files are stored +VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by /bin/activate) +" + exit 1 +} + +# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + # -e|--extension) + # EXTENSION="$2" + # shift # past argument + # shift # past value + # ;; + -h|--help) + shift # past argument + usage + ;; + -*) + echo "Unknown option $1" + exit 1 + ;; + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + +set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters +COMMAND=${1:-"${COMMAND}"} +ARG2=$2 +ARG3=$3 + +# Do what is requested +main() { + # Command Dispatch + case "$COMMAND" in + run) + # Parse positional parameters + BENCHMARK=${ARG2:-"${BENCHMARK}"} + BRANCH_NAME=$(cd "${WREN_DIR}" && git rev-parse --abbrev-ref HEAD) + BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _ + RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"} + RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"} + + echo "***************************" + echo "DataFusion Benchmark Script" + echo "COMMAND: ${COMMAND}" + echo "BENCHMARK: ${BENCHMARK}" + echo "WREN_DIR: ${WREN_DIR}" + echo "BRANCH_NAME: ${BRANCH_NAME}" + echo "RESULTS_DIR: ${RESULTS_DIR}" + echo "CARGO_COMMAND: ${CARGO_COMMAND}" + echo "***************************" + + # navigate to the appropriate directory + pushd "${WREN_DIR}/benchmarks" > /dev/null + mkdir -p "${RESULTS_DIR}" + case "$BENCHMARK" in + all) + run_tpch "1" + ;; + tpch) + run_tpch "1" + ;; + *) + echo "Error: unknown benchmark '$BENCHMARK' for run" + usage + ;; + esac + popd > /dev/null + echo "Done" + ;; + compare) + compare_benchmarks "$ARG2" "$ARG3" + ;; + venv) + setup_venv + ;; + "") + usage + ;; + *) + echo "Error: unknown command: $COMMAND" + usage + ;; + esac +} + + +# Runs the tpch benchmark +run_tpch() { + RESULTS_FILE="${RESULTS_DIR}/tpch.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running tpch benchmark..." + $CARGO_COMMAND --bin tpch -- benchmark -i 10 -o "${RESULTS_FILE}" +} + + + + +compare_benchmarks() { + BASE_RESULTS_DIR="${SCRIPT_DIR}/results" + BRANCH1="$1" + BRANCH2="$2" + if [ -z "$BRANCH1" ] ; then + echo " not specified. Available branches:" + ls -1 "${BASE_RESULTS_DIR}" + exit 1 + fi + + if [ -z "$BRANCH2" ] ; then + echo " not specified" + ls -1 "${BASE_RESULTS_DIR}" + exit 1 + fi + + echo "Comparing ${BRANCH1} and ${BRANCH2}" + for RESULTS_FILE1 in "${BASE_RESULTS_DIR}/${BRANCH1}"/*.json ; do + BENCH=$(basename "${RESULTS_FILE1}") + RESULTS_FILE2="${BASE_RESULTS_DIR}/${BRANCH2}/${BENCH}" + if test -f "${RESULTS_FILE2}" ; then + echo "--------------------" + echo "Benchmark ${BENCH}" + echo "--------------------" + PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}" + else + echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist" + fi + done + +} + +setup_venv() { + python3 -m venv "$VIRTUAL_ENV" + PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt +} + +# And start the process up +main diff --git a/wren-modeling-rs/benchmarks/src/bin/tpch.rs b/wren-modeling-rs/benchmarks/src/bin/tpch.rs new file mode 100644 index 000000000..77ca01819 --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/bin/tpch.rs @@ -0,0 +1,17 @@ +use datafusion::error::Result; +use structopt::StructOpt; +use wren_benchmarks::tpch; + +#[derive(Debug, StructOpt)] +#[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")] +enum TpchOpt { + Benchmark(tpch::run::RunOpt), +} + +#[tokio::main] +async fn main() -> Result<()> { + env_logger::init(); + match TpchOpt::from_args() { + TpchOpt::Benchmark(opt) => opt.run().await, + } +} diff --git a/wren-modeling-rs/benchmarks/src/lib.rs b/wren-modeling-rs/benchmarks/src/lib.rs new file mode 100644 index 000000000..2e8f26292 --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/lib.rs @@ -0,0 +1,2 @@ +pub mod tpch; +pub mod util; diff --git a/wren-modeling-rs/benchmarks/src/tpch/mod.rs b/wren-modeling-rs/benchmarks/src/tpch/mod.rs new file mode 100644 index 000000000..85a9316c3 --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/tpch/mod.rs @@ -0,0 +1,154 @@ +use datafusion::common::{plan_err, Result}; +use std::fs; +use wren_core::mdl::builder::{ColumnBuilder, ManifestBuilder, ModelBuilder}; +use wren_core::mdl::manifest::Manifest; + +pub mod run; + +/// Get the SQL statements from the specified query file +pub fn get_query_sql(query: usize) -> Result> { + if query > 0 && query < 23 { + let possibilities = vec![ + format!("queries/tpch/q{query}.sql"), + format!("benchmarks/queries/tpch/q{query}.sql"), + ]; + let mut errors = vec![]; + for filename in possibilities { + match fs::read_to_string(&filename) { + Ok(contents) => { + return Ok(contents + .split(';') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .collect()); + } + Err(e) => errors.push(format!("{filename}: {e}")), + }; + } + plan_err!("invalid query. Could not find query: {:?}", errors) + } else { + plan_err!("invalid query. Expected value between 1 and 22") + } +} + +fn tpch_manifest() -> Manifest { + ManifestBuilder::new() + .model( + ModelBuilder::new("customer") + .table_reference("datafusion.public.customer") + .column(ColumnBuilder::new("c_custkey", "bigint").build()) + .column(ColumnBuilder::new("c_name", "varchar").build()) + .column(ColumnBuilder::new("c_address", "varchar").build()) + .column(ColumnBuilder::new("c_nationkey", "bigint").build()) + .column(ColumnBuilder::new("c_phone", "varchar").build()) + .column(ColumnBuilder::new("c_acctbal", "double").build()) + .column(ColumnBuilder::new("c_mktsegment", "varchar").build()) + .column(ColumnBuilder::new("c_comment", "varchar").build()) + .primary_key("c_custkey") + .build(), + ) + // Orders + .model( + ModelBuilder::new("orders") + .table_reference("datafusion.public.orders") + .column(ColumnBuilder::new("o_orderkey", "bigint").build()) + .column(ColumnBuilder::new("o_custkey", "bigint").build()) + .column(ColumnBuilder::new("o_orderstatus", "char").build()) + .column(ColumnBuilder::new("o_totalprice", "double").build()) + .column(ColumnBuilder::new("o_orderdate", "date").build()) + .column(ColumnBuilder::new("o_orderpriority", "varchar").build()) + .column(ColumnBuilder::new("o_clerk", "varchar").build()) + .column(ColumnBuilder::new("o_shippriority", "int").build()) + .column(ColumnBuilder::new("o_comment", "varchar").build()) + .primary_key("o_orderkey") + .build(), + ) + // Lineitem + .model( + ModelBuilder::new("lineitem") + .table_reference("datafusion.public.lineitem") + .column(ColumnBuilder::new("l_orderkey", "bigint").build()) + .column(ColumnBuilder::new("l_partkey", "bigint").build()) + .column(ColumnBuilder::new("l_suppkey", "bigint").build()) + .column(ColumnBuilder::new("l_linenumber", "int").build()) + .column(ColumnBuilder::new("l_quantity", "double").build()) + .column(ColumnBuilder::new("l_extendedprice", "double").build()) + .column(ColumnBuilder::new("l_discount", "double").build()) + .column(ColumnBuilder::new("l_tax", "double").build()) + .column(ColumnBuilder::new("l_returnflag", "char").build()) + .column(ColumnBuilder::new("l_linestatus", "char").build()) + .column(ColumnBuilder::new("l_shipdate", "date").build()) + .column(ColumnBuilder::new("l_commitdate", "date").build()) + .column(ColumnBuilder::new("l_receiptdate", "date").build()) + .column(ColumnBuilder::new("l_shipinstruct", "varchar").build()) + .column(ColumnBuilder::new("l_shipmode", "varchar").build()) + .column(ColumnBuilder::new("l_comment", "varchar").build()) + .primary_key("l_orderkey") + .build(), + ) + // Part + .model( + ModelBuilder::new("part") + .table_reference("datafusion.public.part") + .column(ColumnBuilder::new("p_partkey", "bigint").build()) + .column(ColumnBuilder::new("p_name", "varchar").build()) + .column(ColumnBuilder::new("p_mfgr", "varchar").build()) + .column(ColumnBuilder::new("p_brand", "varchar").build()) + .column(ColumnBuilder::new("p_type", "varchar").build()) + .column(ColumnBuilder::new("p_size", "int").build()) + .column(ColumnBuilder::new("p_container", "varchar").build()) + .column(ColumnBuilder::new("p_retailprice", "double").build()) + .column(ColumnBuilder::new("p_comment", "varchar").build()) + .primary_key("p_partkey") + .build(), + ) + // Partsupp + .model( + ModelBuilder::new("partsupp") + .table_reference("datafusion.public.partsupp") + .column(ColumnBuilder::new("ps_partkey", "bigint").build()) + .column(ColumnBuilder::new("ps_suppkey", "bigint").build()) + .column(ColumnBuilder::new("ps_availqty", "int").build()) + .column(ColumnBuilder::new("ps_supplycost", "double").build()) + .column(ColumnBuilder::new("ps_comment", "varchar").build()) + .primary_key("ps_partkey") // ps_partkey and ps_suppkey should be composite primary key + .build(), + ) + // Supplier + .model( + ModelBuilder::new("supplier") + .table_reference("datafusion.public.supplier") + .column(ColumnBuilder::new("s_suppkey", "bigint").build()) + .column(ColumnBuilder::new("s_name", "varchar").build()) + .column(ColumnBuilder::new("s_address", "varchar").build()) + .column(ColumnBuilder::new("s_nationkey", "bigint").build()) + .column(ColumnBuilder::new("s_phone", "varchar").build()) + .column(ColumnBuilder::new("s_acctbal", "double").build()) + .column(ColumnBuilder::new("s_comment", "varchar").build()) + .primary_key("s_suppkey") + .build(), + ) + // Nation + .model( + ModelBuilder::new("nation") + .table_reference("datafusion.public.nation") + .column(ColumnBuilder::new("n_nationkey", "bigint").build()) + .column(ColumnBuilder::new("n_name", "varchar").build()) + .column(ColumnBuilder::new("n_regionkey", "bigint").build()) + .column(ColumnBuilder::new("n_comment", "varchar").build()) + .primary_key("n_nationkey") + .build(), + ) + // Region + .model( + ModelBuilder::new("region") + .table_reference("datafusion.public.region") + .column(ColumnBuilder::new("r_regionkey", "bigint").build()) + .column(ColumnBuilder::new("r_name", "varchar").build()) + .column(ColumnBuilder::new("r_comment", "varchar").build()) + .primary_key("r_regionkey") + .build(), + ) + .build() +} diff --git a/wren-modeling-rs/benchmarks/src/tpch/run.rs b/wren-modeling-rs/benchmarks/src/tpch/run.rs new file mode 100644 index 000000000..d176835f6 --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/tpch/run.rs @@ -0,0 +1,83 @@ +use crate::tpch::{get_query_sql, tpch_manifest}; +use crate::util::options::CommonOpt; +use crate::util::run::BenchmarkRun; +use datafusion::common::Result; +use datafusion::prelude::SessionContext; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; +use structopt::StructOpt; +use wren_core::mdl::{transform_sql_with_ctx, AnalyzedWrenMDL}; + +#[derive(Debug, StructOpt, Clone)] +#[structopt(verbatim_doc_comment)] +pub struct RunOpt { + /// Query number. If not specified, runs all queries + #[structopt(short, long)] + query: Option, + /// Common options + #[structopt(flatten)] + common: CommonOpt, + + /// Path to machine readable output file + #[structopt(parse(from_os_str), short = "o", long = "output")] + output_path: Option, +} + +const TPCH_QUERY_START_ID: usize = 1; +const TPCH_QUERY_END_ID: usize = 22; + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running benchmarks with the following options: {self:?}"); + let query_range = match self.query { + Some(query_id) => query_id..=query_id, + None => TPCH_QUERY_START_ID..=TPCH_QUERY_END_ID, + }; + + let mut benchmark_run = BenchmarkRun::new(); + for query_id in query_range { + benchmark_run.start_new_case(&format!("Query {query_id}")); + let query_run = self.benchmark_query(query_id).await?; + for iter in query_run { + benchmark_run.write_iter(iter.elapsed); + } + } + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + Ok(()) + } + + async fn benchmark_query(&self, query_id: usize) -> Result> { + let ctx = SessionContext::new(); + let mdl = Arc::new(AnalyzedWrenMDL::analyze(tpch_manifest())?); + let mut millis = vec![]; + // run benchmark + let mut query_results = vec![]; + for i in 0..self.iterations() { + let start = Instant::now(); + let sql = &get_query_sql(query_id)?; + for query in sql { + transform_sql_with_ctx(&ctx, Arc::clone(&mdl), query).await?; + } + + let elapsed = start.elapsed(); //.as_secs_f64() * 1000.0; + let ms = elapsed.as_secs_f64() * 1000.0; + millis.push(ms); + println!("Query {query_id} iteration {i} took {ms:.1} ms"); + query_results.push(QueryResult { elapsed }); + } + + let avg = millis.iter().sum::() / millis.len() as f64; + println!("Query {query_id} avg time: {avg:.2} ms"); + + Ok(query_results) + } + + fn iterations(&self) -> usize { + self.common.iterations + } +} + +struct QueryResult { + elapsed: std::time::Duration, +} diff --git a/wren-modeling-rs/benchmarks/src/util/mod.rs b/wren-modeling-rs/benchmarks/src/util/mod.rs new file mode 100644 index 000000000..ce5c50dc4 --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/util/mod.rs @@ -0,0 +1,2 @@ +pub mod options; +pub mod run; diff --git a/wren-modeling-rs/benchmarks/src/util/options.rs b/wren-modeling-rs/benchmarks/src/util/options.rs new file mode 100644 index 000000000..fcf8e3349 --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/util/options.rs @@ -0,0 +1,10 @@ +use structopt::StructOpt; + +// Common benchmark options (don't use doc comments otherwise this doc +// shows up in help files) +#[derive(Debug, StructOpt, Clone)] +pub struct CommonOpt { + /// Number of iterations of each test run + #[structopt(short = "i", long = "iterations", default_value = "3")] + pub iterations: usize, +} diff --git a/wren-modeling-rs/benchmarks/src/util/run.rs b/wren-modeling-rs/benchmarks/src/util/run.rs new file mode 100644 index 000000000..c79617aaa --- /dev/null +++ b/wren-modeling-rs/benchmarks/src/util/run.rs @@ -0,0 +1,130 @@ +use datafusion::common::Result; +use serde::{Serialize, Serializer}; +use serde_json::Value; +use std::collections::HashMap; +use std::path::Path; +use std::time::{Duration, SystemTime}; + +fn serialize_start_time(start_time: &SystemTime, ser: S) -> Result +where + S: Serializer, +{ + ser.serialize_u64( + start_time + .duration_since(SystemTime::UNIX_EPOCH) + .expect("current time is later than UNIX_EPOCH") + .as_secs(), + ) +} +fn serialize_elapsed(elapsed: &Duration, ser: S) -> Result +where + S: Serializer, +{ + let ms = elapsed.as_secs_f64() * 1000.0; + ser.serialize_f64(ms) +} + +#[derive(Debug, Serialize)] +pub struct RunContext { + /// Benchmark crate version + pub benchmark_version: String, + /// Number of CPU cores + pub num_cpus: usize, + /// Start time + #[serde(serialize_with = "serialize_start_time")] + pub start_time: SystemTime, + /// CLI arguments + pub arguments: Vec, +} + +impl Default for RunContext { + fn default() -> Self { + Self::new() + } +} + +impl RunContext { + pub fn new() -> Self { + Self { + benchmark_version: env!("CARGO_PKG_VERSION").to_owned(), + num_cpus: num_cpus::get(), + start_time: SystemTime::now(), + arguments: std::env::args().skip(1).collect::>(), + } + } +} + +/// A single iteration of a benchmark query +#[derive(Debug, Serialize)] +struct QueryIter { + #[serde(serialize_with = "serialize_elapsed")] + elapsed: Duration, +} +/// A single benchmark case +#[derive(Debug, Serialize)] +pub struct BenchQuery { + query: String, + iterations: Vec, + #[serde(serialize_with = "serialize_start_time")] + start_time: SystemTime, +} + +pub struct BenchmarkRun { + context: RunContext, + queries: Vec, + current_case: Option, +} + +impl Default for BenchmarkRun { + fn default() -> Self { + Self::new() + } +} + +impl BenchmarkRun { + // create new + pub fn new() -> Self { + Self { + context: RunContext::new(), + queries: vec![], + current_case: None, + } + } + /// begin a new case. iterations added after this will be included in the new case + pub fn start_new_case(&mut self, id: &str) { + self.queries.push(BenchQuery { + query: id.to_owned(), + iterations: vec![], + start_time: SystemTime::now(), + }); + if let Some(c) = self.current_case.as_mut() { + *c += 1; + } else { + self.current_case = Some(0); + } + } + /// Write a new iteration to the current case + pub fn write_iter(&mut self, elapsed: Duration) { + if let Some(idx) = self.current_case { + self.queries[idx].iterations.push(QueryIter { elapsed }) + } else { + panic!("no cases existed yet"); + } + } + + /// Stringify data into formatted json + pub fn to_json(&self) -> String { + let mut output = HashMap::<&str, Value>::new(); + output.insert("context", serde_json::to_value(&self.context).unwrap()); + output.insert("queries", serde_json::to_value(&self.queries).unwrap()); + serde_json::to_string_pretty(&output).unwrap() + } + + /// Write data as json into output path if it exists. + pub fn maybe_write_json(&self, maybe_path: Option>) -> Result<()> { + if let Some(path) = maybe_path { + std::fs::write(path, self.to_json())?; + }; + Ok(()) + } +} From 33e7d557d0162018d8ef4fab6fb6ad85778ea5fe Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 26 Sep 2024 18:14:42 +0800 Subject: [PATCH 2/5] add tpch query --- .../benchmarks/queries/tpch/q1.sql | 21 +++++++++ .../benchmarks/queries/tpch/q10.sql | 31 +++++++++++++ .../benchmarks/queries/tpch/q11.sql | 27 ++++++++++++ .../benchmarks/queries/tpch/q12.sql | 30 +++++++++++++ .../benchmarks/queries/tpch/q13.sql | 20 +++++++++ .../benchmarks/queries/tpch/q14.sql | 13 ++++++ .../benchmarks/queries/tpch/q15.sql | 31 +++++++++++++ .../benchmarks/queries/tpch/q16.sql | 30 +++++++++++++ .../benchmarks/queries/tpch/q17.sql | 17 ++++++++ .../benchmarks/queries/tpch/q18.sql | 32 ++++++++++++++ .../benchmarks/queries/tpch/q19.sql | 35 +++++++++++++++ .../benchmarks/queries/tpch/q2.sql | 43 +++++++++++++++++++ .../benchmarks/queries/tpch/q20.sql | 37 ++++++++++++++++ .../benchmarks/queries/tpch/q21.sql | 39 +++++++++++++++++ .../benchmarks/queries/tpch/q22.sql | 37 ++++++++++++++++ .../benchmarks/queries/tpch/q3.sql | 22 ++++++++++ .../benchmarks/queries/tpch/q4.sql | 21 +++++++++ .../benchmarks/queries/tpch/q5.sql | 24 +++++++++++ .../benchmarks/queries/tpch/q6.sql | 9 ++++ .../benchmarks/queries/tpch/q7.sql | 39 +++++++++++++++++ .../benchmarks/queries/tpch/q8.sql | 37 ++++++++++++++++ .../benchmarks/queries/tpch/q9.sql | 32 ++++++++++++++ 22 files changed, 627 insertions(+) create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q1.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q10.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q11.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q12.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q13.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q14.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q15.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q16.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q17.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q18.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q19.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q2.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q20.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q21.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q22.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q3.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q4.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q5.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q6.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q7.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q8.sql create mode 100644 wren-modeling-rs/benchmarks/queries/tpch/q9.sql diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q1.sql b/wren-modeling-rs/benchmarks/queries/tpch/q1.sql new file mode 100644 index 000000000..a0fcf159e --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q1.sql @@ -0,0 +1,21 @@ +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-09-02' +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q10.sql b/wren-modeling-rs/benchmarks/queries/tpch/q10.sql new file mode 100644 index 000000000..cf45e4348 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q10.sql @@ -0,0 +1,31 @@ +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1994-01-01' + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q11.sql b/wren-modeling-rs/benchmarks/queries/tpch/q11.sql new file mode 100644 index 000000000..c23ed1c71 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q11.sql @@ -0,0 +1,27 @@ +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) +order by + value desc; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q12.sql b/wren-modeling-rs/benchmarks/queries/tpch/q12.sql new file mode 100644 index 000000000..f8e6d960c --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q12.sql @@ -0,0 +1,30 @@ +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + lineitem + join + orders + on + l_orderkey = o_orderkey +where + l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1995-01-01' +group by + l_shipmode +order by + l_shipmode; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q13.sql b/wren-modeling-rs/benchmarks/queries/tpch/q13.sql new file mode 100644 index 000000000..4bfe8c355 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q13.sql @@ -0,0 +1,20 @@ +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%special%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q14.sql b/wren-modeling-rs/benchmarks/queries/tpch/q14.sql new file mode 100644 index 000000000..d8ef6afac --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q14.sql @@ -0,0 +1,13 @@ +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-10-01'; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q15.sql b/wren-modeling-rs/benchmarks/queries/tpch/q15.sql new file mode 100644 index 000000000..c029105f3 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q15.sql @@ -0,0 +1,31 @@ +with revenue0 (supplier_no, total_revenue) as ( + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '3' month + group by + l_suppkey +) +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 +) +order by + s_suppkey; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q16.sql b/wren-modeling-rs/benchmarks/queries/tpch/q16.sql new file mode 100644 index 000000000..36b7c07c1 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q16.sql @@ -0,0 +1,30 @@ +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' +) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q17.sql b/wren-modeling-rs/benchmarks/queries/tpch/q17.sql new file mode 100644 index 000000000..1e6555063 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q17.sql @@ -0,0 +1,17 @@ +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey +); \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q18.sql b/wren-modeling-rs/benchmarks/queries/tpch/q18.sql new file mode 100644 index 000000000..835de28a5 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q18.sql @@ -0,0 +1,32 @@ +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q19.sql b/wren-modeling-rs/benchmarks/queries/tpch/q19.sql new file mode 100644 index 000000000..56668e73f --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q19.sql @@ -0,0 +1,35 @@ +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ); \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q2.sql b/wren-modeling-rs/benchmarks/queries/tpch/q2.sql new file mode 100644 index 000000000..f66af2102 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q2.sql @@ -0,0 +1,43 @@ +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' +) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q20.sql b/wren-modeling-rs/benchmarks/queries/tpch/q20.sql new file mode 100644 index 000000000..dd61a7d8e --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q20.sql @@ -0,0 +1,37 @@ +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' +order by + s_name; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q21.sql b/wren-modeling-rs/benchmarks/queries/tpch/q21.sql new file mode 100644 index 000000000..9d2fe32ce --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q21.sql @@ -0,0 +1,39 @@ +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q22.sql b/wren-modeling-rs/benchmarks/queries/tpch/q22.sql new file mode 100644 index 000000000..90aea6fd7 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q22.sql @@ -0,0 +1,37 @@ +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q3.sql b/wren-modeling-rs/benchmarks/queries/tpch/q3.sql new file mode 100644 index 000000000..7dbc6d9ef --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q3.sql @@ -0,0 +1,22 @@ +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q4.sql b/wren-modeling-rs/benchmarks/queries/tpch/q4.sql new file mode 100644 index 000000000..74a620dbc --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q4.sql @@ -0,0 +1,21 @@ +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q5.sql b/wren-modeling-rs/benchmarks/queries/tpch/q5.sql new file mode 100644 index 000000000..5a336b231 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q5.sql @@ -0,0 +1,24 @@ +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1995-01-01' +group by + n_name +order by + revenue desc; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q6.sql b/wren-modeling-rs/benchmarks/queries/tpch/q6.sql new file mode 100644 index 000000000..5806f980f --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q6.sql @@ -0,0 +1,9 @@ +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1994-01-01' + and l_shipdate < date '1995-01-01' + and l_discount between 0.06 - 0.01 and 0.06 + 0.01 + and l_quantity < 24; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q7.sql b/wren-modeling-rs/benchmarks/queries/tpch/q7.sql new file mode 100644 index 000000000..512e5be55 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q7.sql @@ -0,0 +1,39 @@ +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q8.sql b/wren-modeling-rs/benchmarks/queries/tpch/q8.sql new file mode 100644 index 000000000..6ddb2a674 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q8.sql @@ -0,0 +1,37 @@ +select + o_year, + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations +group by + o_year +order by + o_year; \ No newline at end of file diff --git a/wren-modeling-rs/benchmarks/queries/tpch/q9.sql b/wren-modeling-rs/benchmarks/queries/tpch/q9.sql new file mode 100644 index 000000000..587bbc8a2 --- /dev/null +++ b/wren-modeling-rs/benchmarks/queries/tpch/q9.sql @@ -0,0 +1,32 @@ +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; \ No newline at end of file From 3fd0c79979340af38cc42305badd28dd4186a8bc Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 26 Sep 2024 18:15:16 +0800 Subject: [PATCH 3/5] add readme and compare script --- wren-modeling-rs/benchmarks/README.md | 101 ++++++++++ wren-modeling-rs/benchmarks/compare.py | 202 +++++++++++++++++++ wren-modeling-rs/benchmarks/requirements.txt | 18 ++ 3 files changed, 321 insertions(+) create mode 100644 wren-modeling-rs/benchmarks/README.md create mode 100755 wren-modeling-rs/benchmarks/compare.py create mode 100644 wren-modeling-rs/benchmarks/requirements.txt diff --git a/wren-modeling-rs/benchmarks/README.md b/wren-modeling-rs/benchmarks/README.md new file mode 100644 index 000000000..8b12477fe --- /dev/null +++ b/wren-modeling-rs/benchmarks/README.md @@ -0,0 +1,101 @@ +# Wren core benchmarks + +This crate contains the benchmarks for the Wren core library based on some open source benchmarks, to help +with performance improvements of Wren core. + +# Supported Benchmarks + +## TPCH + +Run the tpch benchmark. + +This benchmarks is derived from the [TPC-H][1] version +[2.17.1]. The data and answers are generated using `tpch-gen` from +[2]. + +[1]: http://www.tpc.org/tpch/ +[2]: https://github.com/databricks/tpch-dbgen.git, +[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf + + +# Running the benchmarks + +## `bench.sh` + +The easiest way to run benchmarks is the [bench.sh](bench.sh) +script. Usage instructions can be found with: + +```shell +# show usage +./bench.sh +``` + +## Comparing performance of main and a branch + +```shell +git checkout main + +# Gather baseline data for tpch benchmark +./benchmarks/bench.sh run tpch + +# Switch to the branch the branch name is mybranch and gather data +git checkout mybranch +./benchmarks/bench.sh run tpch + +# Compare results in the two branches: +./bench.sh compare main mybranch +``` + +This produces results like: + +```shell +Comparing main and mybranch +-------------------- +Benchmark tpch.json +-------------------- +┏━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓ +┃ Query ┃ main ┃mybranch ┃ Change ┃ +┡━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩ +│ QQuery 1 │ 4.25ms │ 4.26ms │ no change │ +│ QQuery 2 │ 11.25ms │ 11.68ms │ no change │ +│ QQuery 3 │ 5.03ms │ 4.97ms │ no change │ +│ QQuery 4 │ 3.43ms │ 3.46ms │ no change │ +│ QQuery 5 │ 7.39ms │ 7.28ms │ no change │ +│ QQuery 6 │ 2.26ms │ 2.26ms │ no change │ +│ QQuery 7 │ 8.53ms │ 8.51ms │ no change │ +│ QQuery 8 │ 9.90ms │ 9.99ms │ no change │ +│ QQuery 9 │ 8.56ms │ 8.27ms │ no change │ +│ QQuery 10 │ 7.37ms │ 7.63ms │ no change │ +│ QQuery 11 │ 7.06ms │ 7.00ms │ no change │ +│ QQuery 12 │ 4.35ms │ 4.19ms │ no change │ +│ QQuery 13 │ 2.93ms │ 2.88ms │ no change │ +│ QQuery 14 │ 3.34ms │ 3.33ms │ no change │ +│ QQuery 15 │ 6.51ms │ 6.49ms │ no change │ +│ QQuery 16 │ 4.59ms │ 4.64ms │ no change │ +│ QQuery 17 │ 4.00ms │ 4.05ms │ no change │ +│ QQuery 18 │ 5.46ms │ 5.47ms │ no change │ +│ QQuery 19 │ 5.84ms │ 5.72ms │ no change │ +│ QQuery 20 │ 7.22ms │ 7.33ms │ no change │ +│ QQuery 21 │ 9.35ms │ 9.19ms │ no change │ +│ QQuery 22 │ 4.54ms │ 4.33ms │ no change │ +└──────────────┴─────────┴─────────┴───────────┘ +┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Benchmark Summary ┃ ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ Total Time (main) │ 133.16ms │ +│ Total Time (mybranch) │ 132.92ms │ +│ Average Time (main) │ 6.05ms │ +│ Average Time (mybranch)│ 6.04ms │ +│ Queries Faster │ 0 │ +│ Queries Slower │ 0 │ +│ Queries with No Change │ 22 │ +└────────────────────────┴──────────┘ +``` + +### Running Benchmarks Manually + +The `tpch` benchmark can be run with a command like this + +```bash +cargo run --release --bin tpch -- benchmark --query 1 -i 10 -o result.json +``` diff --git a/wren-modeling-rs/benchmarks/compare.py b/wren-modeling-rs/benchmarks/compare.py new file mode 100755 index 000000000..120836942 --- /dev/null +++ b/wren-modeling-rs/benchmarks/compare.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +from __future__ import annotations + +import json +from dataclasses import dataclass +from typing import Dict, List, Any +from pathlib import Path +from argparse import ArgumentParser + +try: + from rich.console import Console + from rich.table import Table +except ImportError: + print("Couldn't import modules -- run `./bench.sh venv` first") + raise + + +@dataclass +class QueryResult: + elapsed: float + + @classmethod + def load_from(cls, data: Dict[str, Any]) -> QueryResult: + return cls(elapsed=data["elapsed"]) + + +@dataclass +class QueryRun: + query: int + iterations: List[QueryResult] + start_time: int + + @classmethod + def load_from(cls, data: Dict[str, Any]) -> QueryRun: + return cls( + query=data["query"], + iterations=[QueryResult(**iteration) for iteration in data["iterations"]], + start_time=data["start_time"], + ) + + @property + def execution_time(self) -> float: + assert len(self.iterations) >= 1 + + # Use minimum execution time to account for variations / other + # things the system was doing + return min(iteration.elapsed for iteration in self.iterations) + + +@dataclass +class Context: + benchmark_version: str + num_cpus: int + start_time: int + arguments: List[str] + + @classmethod + def load_from(cls, data: Dict[str, Any]) -> Context: + return cls( + benchmark_version=data["benchmark_version"], + num_cpus=data["num_cpus"], + start_time=data["start_time"], + arguments=data["arguments"], + ) + + +@dataclass +class BenchmarkRun: + context: Context + queries: List[QueryRun] + + @classmethod + def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun: + return cls( + context=Context.load_from(data["context"]), + queries=[QueryRun.load_from(result) for result in data["queries"]], + ) + + @classmethod + def load_from_file(cls, path: Path) -> BenchmarkRun: + with open(path, "r") as f: + return cls.load_from(json.load(f)) + + +def compare( + baseline_path: Path, + comparison_path: Path, + noise_threshold: float, +) -> None: + baseline = BenchmarkRun.load_from_file(baseline_path) + comparison = BenchmarkRun.load_from_file(comparison_path) + + console = Console() + + # use basename as the column names + baseline_header = baseline_path.parent.stem + comparison_header = comparison_path.parent.stem + + table = Table(show_header=True, header_style="bold magenta") + table.add_column("Query", style="dim", width=12) + table.add_column(baseline_header, justify="right", style="dim") + table.add_column(comparison_header, justify="right", style="dim") + table.add_column("Change", justify="right", style="dim") + + faster_count = 0 + slower_count = 0 + no_change_count = 0 + total_baseline_time = 0 + total_comparison_time = 0 + + for baseline_result, comparison_result in zip(baseline.queries, comparison.queries): + assert baseline_result.query == comparison_result.query + + total_baseline_time += baseline_result.execution_time + total_comparison_time += comparison_result.execution_time + + change = comparison_result.execution_time / baseline_result.execution_time + + if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold): + change_text = "no change" + no_change_count += 1 + elif change < 1.0: + change_text = f"+{(1 / change):.2f}x faster" + faster_count += 1 + else: + change_text = f"{change:.2f}x slower" + slower_count += 1 + + table.add_row( + f"Q{baseline_result.query}", + f"{baseline_result.execution_time:.2f}ms", + f"{comparison_result.execution_time:.2f}ms", + change_text, + ) + + console.print(table) + + # Calculate averages + avg_baseline_time = total_baseline_time / len(baseline.queries) + avg_comparison_time = total_comparison_time / len(comparison.queries) + + # Summary table + summary_table = Table(show_header=True, header_style="bold magenta") + summary_table.add_column("Benchmark Summary", justify="left", style="dim") + summary_table.add_column("", justify="right", style="dim") + + summary_table.add_row(f"Total Time ({baseline_header})", f"{total_baseline_time:.2f}ms") + summary_table.add_row(f"Total Time ({comparison_header})", f"{total_comparison_time:.2f}ms") + summary_table.add_row(f"Average Time ({baseline_header})", f"{avg_baseline_time:.2f}ms") + summary_table.add_row(f"Average Time ({comparison_header})", f"{avg_comparison_time:.2f}ms") + summary_table.add_row("Queries Faster", str(faster_count)) + summary_table.add_row("Queries Slower", str(slower_count)) + summary_table.add_row("Queries with No Change", str(no_change_count)) + + console.print(summary_table) + +def main() -> None: + parser = ArgumentParser() + compare_parser = parser + compare_parser.add_argument( + "baseline_path", + type=Path, + help="Path to the baseline summary file.", + ) + compare_parser.add_argument( + "comparison_path", + type=Path, + help="Path to the comparison summary file.", + ) + compare_parser.add_argument( + "--noise-threshold", + type=float, + default=0.05, + help="The threshold for statistically insignificant results (+/- %5).", + ) + + options = parser.parse_args() + + compare(options.baseline_path, options.comparison_path, options.noise_threshold) + + + +if __name__ == "__main__": + main() diff --git a/wren-modeling-rs/benchmarks/requirements.txt b/wren-modeling-rs/benchmarks/requirements.txt new file mode 100644 index 000000000..20a5a2bdd --- /dev/null +++ b/wren-modeling-rs/benchmarks/requirements.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +rich From 54e9a7566123c986c032b5548a5231e8924b5ed5 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 26 Sep 2024 18:16:19 +0800 Subject: [PATCH 4/5] update gitignore --- wren-modeling-rs/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/wren-modeling-rs/.gitignore b/wren-modeling-rs/.gitignore index 59860a6f6..c747abfaa 100644 --- a/wren-modeling-rs/.gitignore +++ b/wren-modeling-rs/.gitignore @@ -1,3 +1,4 @@ Cargo.lock target/ sqllogictest/test_files/scratch/ +benchmarks/results/ \ No newline at end of file From 75d5745a105fd3ff3e74fc341e202693a9f1e04a Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 26 Sep 2024 18:26:31 +0800 Subject: [PATCH 5/5] taplo fmt --- wren-modeling-rs/benchmarks/Cargo.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wren-modeling-rs/benchmarks/Cargo.toml b/wren-modeling-rs/benchmarks/Cargo.toml index 3bbae3cf9..2c1f7d3ec 100644 --- a/wren-modeling-rs/benchmarks/Cargo.toml +++ b/wren-modeling-rs/benchmarks/Cargo.toml @@ -14,12 +14,12 @@ name = "wren_benchmarks" path = "src/lib.rs" [dependencies] -wren-core = { workspace = true } datafusion = { workspace = true } +env_logger = { workspace = true } +log = "0.4.21" num_cpus = "1.16.0" -structopt = {version = "0.3.26", default-features = false} -tokio = {workspace = true } -env_logger = {workspace = true } -serde = {workspace = true } -serde_json = {workspace = true} -log = "0.4.21" \ No newline at end of file +serde = { workspace = true } +serde_json = { workspace = true } +structopt = { version = "0.3.26", default-features = false } +tokio = { workspace = true } +wren-core = { workspace = true }