chore(core): introduce the benchmark framework for Wren core Rust #805

Merged · 5 commits · Sep 27, 2024
1 change: 1 addition & 0 deletions wren-modeling-rs/.gitignore
@@ -1,3 +1,4 @@
Cargo.lock
target/
sqllogictest/test_files/scratch/
benchmarks/results/
2 changes: 1 addition & 1 deletion wren-modeling-rs/Cargo.toml
@@ -1,5 +1,5 @@
[workspace]
members = ["core", "sqllogictest", "wren-example"]
members = ["benchmarks", "core", "sqllogictest", "wren-example"]
resolver = "2"

[workspace.package]
25 changes: 25 additions & 0 deletions wren-modeling-rs/benchmarks/Cargo.toml
@@ -0,0 +1,25 @@
[package]
name = "wren-benchmarks"
authors.workspace = true
edition.workspace = true
homepage.workspace = true
license.workspace = true
readme.workspace = true
repository.workspace = true
rust-version.workspace = true
version.workspace = true

[lib]
name = "wren_benchmarks"
path = "src/lib.rs"

[dependencies]
datafusion = { workspace = true }
env_logger = { workspace = true }
log = "0.4.21"
num_cpus = "1.16.0"
serde = { workspace = true }
serde_json = { workspace = true }
structopt = { version = "0.3.26", default-features = false }
tokio = { workspace = true }
wren-core = { workspace = true }
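
The manifest pulls in `tokio`, `wren-core`, `serde_json`, and `structopt`, which suggests an async `tpch` binary that times planning calls and writes the measurements out as JSON. A minimal sketch of such a timing loop under that assumption; `plan_tpch_query` below is a hypothetical placeholder, not an actual wren-core API:

```rust
// Illustrative timing loop only; `plan_tpch_query` is a hypothetical
// placeholder for the wren-core planning call being measured.
use std::time::Instant;

async fn plan_tpch_query(query: usize) {
    // Stand-in for the real work (e.g. rewriting a TPC-H query through wren-core).
    let _ = query;
}

#[tokio::main]
async fn main() {
    let iterations = 10;
    for query in 1..=22 {
        let mut elapsed_ms = Vec::with_capacity(iterations);
        for _ in 0..iterations {
            let start = Instant::now();
            plan_tpch_query(query).await;
            elapsed_ms.push(start.elapsed().as_secs_f64() * 1000.0);
        }
        let avg: f64 = elapsed_ms.iter().sum::<f64>() / elapsed_ms.len() as f64;
        println!("Query {query}: avg {avg:.2} ms over {iterations} iterations");
    }
}
```
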
101 changes: 101 additions & 0 deletions wren-modeling-rs/benchmarks/README.md
@@ -0,0 +1,101 @@
# Wren core benchmarks

This crate contains benchmarks for the Wren core library, based on open source benchmark suites, to help
drive performance improvements of Wren core.

# Supported Benchmarks

## TPCH

Run the tpch benchmark.

This benchmark is derived from the [TPC-H][1] version
[2.17.1]. The data and answers are generated using `tpch-gen` from
[2].

[1]: http://www.tpc.org/tpch/
[2]: https://github.com/databricks/tpch-dbgen.git
[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf


# Running the benchmarks

## `bench.sh`

The easiest way to run benchmarks is the [bench.sh](bench.sh)
script. Usage instructions can be found with:

```shell
# show usage
./bench.sh
```

## Comparing performance of main and a branch

```shell
git checkout main

# Gather baseline data for tpch benchmark
./benchmarks/bench.sh run tpch

# Switch to the branch to compare (here called mybranch) and gather data
git checkout mybranch
./benchmarks/bench.sh run tpch

# Compare results in the two branches:
./bench.sh compare main mybranch
```

This produces results like:

```shell
Comparing main and mybranch
--------------------
Benchmark tpch.json
--------------------
┏━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓
┃ Query        ┃    main ┃mybranch ┃    Change ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩
│ QQuery 1     │  4.25ms │  4.26ms │ no change │
│ QQuery 2     │ 11.25ms │ 11.68ms │ no change │
│ QQuery 3     │  5.03ms │  4.97ms │ no change │
│ QQuery 4     │  3.43ms │  3.46ms │ no change │
│ QQuery 5     │  7.39ms │  7.28ms │ no change │
│ QQuery 6     │  2.26ms │  2.26ms │ no change │
│ QQuery 7     │  8.53ms │  8.51ms │ no change │
│ QQuery 8     │  9.90ms │  9.99ms │ no change │
│ QQuery 9     │  8.56ms │  8.27ms │ no change │
│ QQuery 10    │  7.37ms │  7.63ms │ no change │
│ QQuery 11    │  7.06ms │  7.00ms │ no change │
│ QQuery 12    │  4.35ms │  4.19ms │ no change │
│ QQuery 13    │  2.93ms │  2.88ms │ no change │
│ QQuery 14    │  3.34ms │  3.33ms │ no change │
│ QQuery 15    │  6.51ms │  6.49ms │ no change │
│ QQuery 16    │  4.59ms │  4.64ms │ no change │
│ QQuery 17    │  4.00ms │  4.05ms │ no change │
│ QQuery 18    │  5.46ms │  5.47ms │ no change │
│ QQuery 19    │  5.84ms │  5.72ms │ no change │
│ QQuery 20    │  7.22ms │  7.33ms │ no change │
│ QQuery 21    │  9.35ms │  9.19ms │ no change │
│ QQuery 22    │  4.54ms │  4.33ms │ no change │
└──────────────┴─────────┴─────────┴───────────┘
┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Benchmark Summary      ┃          ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Total Time (main)      │ 133.16ms │
│ Total Time (mybranch)  │ 132.92ms │
│ Average Time (main)    │   6.05ms │
│ Average Time (mybranch)│   6.04ms │
│ Queries Faster         │        0 │
│ Queries Slower         │        0 │
│ Queries with No Change │       22 │
└────────────────────────┴──────────┘
```
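
The comparison reads one JSON file per benchmark from each branch's results directory. The exact schema is not part of this diff, but since the crate depends on `serde` and `serde_json`, the result file presumably serializes something like the sketch below; the type and field names here are assumptions, not the format consumed by `compare.py`.

```rust
// Hedged sketch of a possible results schema; field names are assumptions,
// not the format actually written by the tpch binary in this PR.
use serde::{Deserialize, Serialize};

/// One timed execution of a single query.
#[derive(Debug, Serialize, Deserialize)]
struct QueryIteration {
    /// Elapsed wall-clock time in milliseconds.
    elapsed_ms: f64,
}

/// All iterations recorded for one TPC-H query.
#[derive(Debug, Serialize, Deserialize)]
struct QueryResult {
    query: usize,
    iterations: Vec<QueryIteration>,
}

/// The whole run, written to e.g. results/<branch>/tpch.json.
#[derive(Debug, Serialize, Deserialize)]
struct BenchmarkRun {
    queries: Vec<QueryResult>,
}

fn main() -> serde_json::Result<()> {
    let run = BenchmarkRun {
        queries: vec![QueryResult {
            query: 1,
            iterations: vec![QueryIteration { elapsed_ms: 4.25 }],
        }],
    };
    println!("{}", serde_json::to_string_pretty(&run)?);
    Ok(())
}
```
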

## Running Benchmarks Manually

The `tpch` benchmark can be run manually with a command like this:

```bash
cargo run --release --bin tpch -- benchmark --query 1 -i 10 -o result.json
```
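
Since `structopt` is a declared dependency, the `benchmark` subcommand and the `--query`, `-i`, and `-o` flags shown above could be modeled roughly as in the sketch below; the type and field names are illustrative guesses, not the definitions added in this PR.

```rust
// Rough sketch of a structopt-based CLI for the flags shown above;
// names are illustrative, not the actual types in this PR.
use std::path::PathBuf;
use structopt::StructOpt;

#[derive(Debug, StructOpt)]
#[structopt(name = "tpch", about = "Wren core TPC-H benchmark (sketch)")]
enum TpchOpt {
    /// Run the benchmark and write the timings to a JSON file.
    Benchmark(RunOpt),
}

#[derive(Debug, StructOpt)]
struct RunOpt {
    /// Query number to run (1-22); run all queries when omitted.
    #[structopt(long)]
    query: Option<usize>,

    /// Number of iterations per query.
    #[structopt(short = "i", long = "iterations", default_value = "10")]
    iterations: usize,

    /// Path of the JSON file the results are written to.
    #[structopt(short = "o", long = "output")]
    output: Option<PathBuf>,
}

fn main() {
    // `cargo run --release --bin tpch -- benchmark --query 1 -i 10 -o result.json`
    // would parse into TpchOpt::Benchmark(RunOpt { query: Some(1), iterations: 10, .. }).
    let opt = TpchOpt::from_args();
    println!("{:?}", opt);
}
```
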
214 changes: 214 additions & 0 deletions wren-modeling-rs/benchmarks/bench.sh
@@ -0,0 +1,214 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# This script is meant for developers of Wren core -- it is runnable
# from the standard wren-modeling-rs development environment, uses cargo,
# etc., and orchestrates gathering data and running the benchmark binary
# in different configurations.


# Exit on error
set -e

# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )


# Set Defaults
COMMAND=
BENCHMARK=all
WREN_DIR=${WREN_DIR:-$SCRIPT_DIR/..}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}

usage() {
    echo "
Orchestrates running benchmarks against Wren core checkouts

Usage:
$0 run [benchmark]
$0 compare <branch1> <branch2>
$0 venv

**********
Examples:
**********
# Run the 'tpch' benchmark on the Wren core checkout in /source/wren-modeling-rs
WREN_DIR=/source/wren-modeling-rs ./bench.sh run tpch

**********
* Commands
**********
run:     Runs the named benchmark
compare: Compares results from benchmark runs
venv:    Creates a new venv (unless one already exists) and installs compare's requirements into it

**********
* Benchmarks
**********
all(default): Data/Run/Compare for all benchmarks
tpch:         TPC-H inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join

**********
* Supported Configuration (Environment Variables)
**********
CARGO_COMMAND   command that runs the benchmark binary
WREN_DIR        directory to use (default $WREN_DIR)
RESULTS_NAME    folder where the benchmark files are stored
VIRTUAL_ENV     Python venv to use for compare and venv commands (default ./venv, override with <your-venv>/bin/activate)
"
    exit 1
}

# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash
POSITIONAL_ARGS=()

while [[ $# -gt 0 ]]; do
    case $1 in
        # -e|--extension)
        #     EXTENSION="$2"
        #     shift # past argument
        #     shift # past value
        #     ;;
        -h|--help)
            shift # past argument
            usage
            ;;
        -*)
            echo "Unknown option $1"
            exit 1
            ;;
        *)
            POSITIONAL_ARGS+=("$1") # save positional arg
            shift # past argument
            ;;
    esac
done

set -- "${POSITIONAL_ARGS[@]}" # restore positional parameters
COMMAND=${1:-"${COMMAND}"}
ARG2=$2
ARG3=$3

# Do what is requested
main() {
    # Command Dispatch
    case "$COMMAND" in
        run)
            # Parse positional parameters
            BENCHMARK=${ARG2:-"${BENCHMARK}"}
            BRANCH_NAME=$(cd "${WREN_DIR}" && git rev-parse --abbrev-ref HEAD)
            BRANCH_NAME=${BRANCH_NAME//\//_} # replace any '/' in the branch name with '_'
            RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"}
            RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"}

            echo "***************************"
            echo "Wren Core Benchmark Script"
            echo "COMMAND: ${COMMAND}"
            echo "BENCHMARK: ${BENCHMARK}"
            echo "WREN_DIR: ${WREN_DIR}"
            echo "BRANCH_NAME: ${BRANCH_NAME}"
            echo "RESULTS_DIR: ${RESULTS_DIR}"
            echo "CARGO_COMMAND: ${CARGO_COMMAND}"
            echo "***************************"

            # navigate to the appropriate directory
            pushd "${WREN_DIR}/benchmarks" > /dev/null
            mkdir -p "${RESULTS_DIR}"
            case "$BENCHMARK" in
                all)
                    run_tpch "1"
                    ;;
                tpch)
                    run_tpch "1"
                    ;;
                *)
                    echo "Error: unknown benchmark '$BENCHMARK' for run"
                    usage
                    ;;
            esac
            popd > /dev/null
            echo "Done"
            ;;
        compare)
            compare_benchmarks "$ARG2" "$ARG3"
            ;;
        venv)
            setup_venv
            ;;
        "")
            usage
            ;;
        *)
            echo "Error: unknown command: $COMMAND"
            usage
            ;;
    esac
}


# Runs the tpch benchmark
run_tpch() {
    RESULTS_FILE="${RESULTS_DIR}/tpch.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running tpch benchmark..."
    $CARGO_COMMAND --bin tpch -- benchmark -i 10 -o "${RESULTS_FILE}"
}




compare_benchmarks() {
    BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
    BRANCH1="$1"
    BRANCH2="$2"
    if [ -z "$BRANCH1" ] ; then
        echo "<branch1> not specified. Available branches:"
        ls -1 "${BASE_RESULTS_DIR}"
        exit 1
    fi

    if [ -z "$BRANCH2" ] ; then
        echo "<branch2> not specified. Available branches:"
        ls -1 "${BASE_RESULTS_DIR}"
        exit 1
    fi

    echo "Comparing ${BRANCH1} and ${BRANCH2}"
    for RESULTS_FILE1 in "${BASE_RESULTS_DIR}/${BRANCH1}"/*.json ; do
        BENCH=$(basename "${RESULTS_FILE1}")
        RESULTS_FILE2="${BASE_RESULTS_DIR}/${BRANCH2}/${BENCH}"
        if test -f "${RESULTS_FILE2}" ; then
            echo "--------------------"
            echo "Benchmark ${BENCH}"
            echo "--------------------"
            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
        else
            echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
        fi
    done
}

setup_venv() {
    python3 -m venv "$VIRTUAL_ENV"
    PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
}

# And start the process up
main