From d814701c7d976966cec6c5f6f530efd2200102a3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 16:04:25 -0700 Subject: [PATCH 1/6] add temporal expression benchmarks --- microbenchmarks/microbenchmarks.py | 122 +++++++---------------------- microbenchmarks/suites/__init__.py | 47 +++++++++++ microbenchmarks/suites/strings.py | 74 +++++++++++++++++ microbenchmarks/suites/temporal.py | 91 +++++++++++++++++++++ 4 files changed, 240 insertions(+), 94 deletions(-) create mode 100644 microbenchmarks/suites/__init__.py create mode 100644 microbenchmarks/suites/strings.py create mode 100644 microbenchmarks/suites/temporal.py diff --git a/microbenchmarks/microbenchmarks.py b/microbenchmarks/microbenchmarks.py index c57483d..c904307 100755 --- a/microbenchmarks/microbenchmarks.py +++ b/microbenchmarks/microbenchmarks.py @@ -1,20 +1,20 @@ #!/usr/bin/env python3 """ Microbenchmark comparing DataFusion and DuckDB performance -for SQL string functions on Parquet files. +for various SQL functions on Parquet files. """ import tempfile import time import os from dataclasses import dataclass -from pathlib import Path -import pyarrow as pa import pyarrow.parquet as pq import datafusion import duckdb +from suites import get_suite, list_suites, Suite + @dataclass class BenchmarkResult: @@ -32,85 +32,6 @@ def speedup(self) -> float: return self.duckdb_time_ms / self.datafusion_time_ms -@dataclass -class StringFunction: - """Defines a string function with syntax for both engines.""" - name: str - datafusion_expr: str # Expression using {col} as placeholder for column name - duckdb_expr: str # Expression using {col} as placeholder for column name - - -# String functions to benchmark -# {col} will be replaced with the actual column name -STRING_FUNCTIONS = [ - StringFunction("trim", "trim({col})", "trim({col})"), - StringFunction("ltrim", "ltrim({col})", "ltrim({col})"), - StringFunction("rtrim", "rtrim({col})", "rtrim({col})"), - StringFunction("lower", "lower({col})", "lower({col})"), - StringFunction("upper", "upper({col})", "upper({col})"), - StringFunction("length", "length({col})", "length({col})"), - StringFunction("char_length", "char_length({col})", "length({col})"), - StringFunction("reverse", "reverse({col})", "reverse({col})"), - StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"), - StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"), - StringFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"), - StringFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"), - StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"), - StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"), - StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"), - StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"), - StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"), - StringFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"), - StringFunction("ascii", "ascii({col})", "ascii({col})"), - StringFunction("md5", "md5({col})", "md5({col})"), - StringFunction("sha256", "sha256({col})", "sha256({col})"), - StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"), - StringFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"), - StringFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"), - StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"), - StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"), - StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"), -] - - -def generate_test_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: - """Generate test data with various string patterns.""" - import random - import string - - random.seed(42) # For reproducibility - - # Generate diverse string data - strings = [] - for i in range(num_rows): - # Mix of different string patterns - pattern_type = i % 5 - if pattern_type == 0: - # Short strings with spaces - s = f" test_{i % 1000} " - elif pattern_type == 1: - # Longer strings - s = ''.join(random.choices(string.ascii_lowercase, k=20)) - elif pattern_type == 2: - # Mixed case with numbers - s = f"TestData_{i}_Value" - elif pattern_type == 3: - # Strings with special patterns - s = f"hello world {i % 100} data" - else: - # Random length strings - length = random.randint(5, 50) - s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) - strings.append(s) - - str_type = pa.string_view() if use_string_view else pa.string() - table = pa.table({ - 'str_col': pa.array(strings, type=str_type) - }) - - return table - - def setup_datafusion(parquet_path: str) -> datafusion.SessionContext: """Create and configure DataFusion context with single thread/partition.""" config = datafusion.SessionConfig().with_target_partitions(1) @@ -167,20 +88,20 @@ def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str, return sum(times) / len(times) -def run_benchmarks(num_rows: int = 1_000_000, +def run_benchmarks(suite: Suite, + num_rows: int = 1_000_000, warmup: int = 2, iterations: int = 5, use_string_view: bool = False) -> list[BenchmarkResult]: - """Run all benchmarks and return results.""" + """Run all benchmarks for a suite and return results.""" results = [] with tempfile.TemporaryDirectory() as tmpdir: parquet_path = os.path.join(tmpdir, 'test_data.parquet') # Generate and save test data - str_type = "StringView" if use_string_view else "String" - print(f"Generating {num_rows:,} rows of test data (type: {str_type})...") - table = generate_test_data(num_rows, use_string_view) + print(f"Generating {num_rows:,} rows of test data for '{suite.name}' suite...") + table = suite.generate_data(num_rows, use_string_view) pq.write_table(table, parquet_path) print(f"Parquet file written to: {parquet_path}") print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB") @@ -195,8 +116,8 @@ def run_benchmarks(num_rows: int = 1_000_000, # Run benchmarks print(f"\nRunning benchmarks ({warmup} warmup, {iterations} iterations each)...\n") - col = 'str_col' - for func in STRING_FUNCTIONS: + col = suite.column_name + for func in suite.functions: df_expr = func.datafusion_expr.format(col=col) duck_expr = func.duckdb_expr.format(col=col) @@ -235,12 +156,15 @@ def run_benchmarks(num_rows: int = 1_000_000, return results -def format_results_markdown(results: list[BenchmarkResult], use_string_view: bool = False) -> str: +def format_results_markdown(results: list[BenchmarkResult], + suite: Suite, + use_string_view: bool = False) -> str: """Format benchmark results as a markdown table.""" str_type = "StringView" if use_string_view else "String" lines = [ - "# String Function Microbenchmarks: DataFusion vs DuckDB", + f"# {suite.description}: DataFusion vs DuckDB", "", + f"**Suite:** {suite.name} ", f"**DataFusion version:** {datafusion.__version__} ", f"**DuckDB version:** {duckdb.__version__} ", f"**Rows:** {results[0].rows:,} ", @@ -298,8 +222,15 @@ def format_results_markdown(results: list[BenchmarkResult], use_string_view: boo def main(): import argparse + available_suites = list_suites() + parser = argparse.ArgumentParser( - description="Benchmark string functions: DataFusion vs DuckDB" + description="Benchmark SQL functions: DataFusion vs DuckDB" + ) + parser.add_argument( + "--suite", type=str, default="strings", + choices=available_suites, + help=f"Benchmark suite to run (default: strings). Available: {', '.join(available_suites)}" ) parser.add_argument( "--rows", type=int, default=1_000_000, @@ -324,18 +255,21 @@ def main(): args = parser.parse_args() + suite = get_suite(args.suite) + print("=" * 60) - print("String Function Microbenchmarks: DataFusion vs DuckDB") + print(f"{suite.description}: DataFusion vs DuckDB") print("=" * 60) results = run_benchmarks( + suite=suite, num_rows=args.rows, warmup=args.warmup, iterations=args.iterations, use_string_view=args.string_view ) - markdown = format_results_markdown(results, use_string_view=args.string_view) + markdown = format_results_markdown(results, suite=suite, use_string_view=args.string_view) print("\n" + "=" * 60) print("RESULTS") diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py new file mode 100644 index 0000000..dbfc859 --- /dev/null +++ b/microbenchmarks/suites/__init__.py @@ -0,0 +1,47 @@ +"""Benchmark suites for microbenchmarks.""" + +from dataclasses import dataclass +from typing import Callable +import pyarrow as pa + + +@dataclass +class BenchmarkFunction: + """Defines a function with syntax for both engines.""" + name: str + datafusion_expr: str # Expression using {col} as placeholder for column name + duckdb_expr: str # Expression using {col} as placeholder for column name + + +@dataclass +class Suite: + """Defines a benchmark suite.""" + name: str + description: str + column_name: str + functions: list[BenchmarkFunction] + generate_data: Callable[[int, bool], pa.Table] # (num_rows, use_string_view) -> Table + + +# Import suites to register them +from . import strings +from . import temporal + +# Registry of available suites +SUITES: dict[str, Suite] = { + 'strings': strings.SUITE, + 'temporal': temporal.SUITE, +} + + +def get_suite(name: str) -> Suite: + """Get a suite by name.""" + if name not in SUITES: + available = ', '.join(SUITES.keys()) + raise ValueError(f"Unknown suite: {name}. Available: {available}") + return SUITES[name] + + +def list_suites() -> list[str]: + """List available suite names.""" + return list(SUITES.keys()) diff --git a/microbenchmarks/suites/strings.py b/microbenchmarks/suites/strings.py new file mode 100644 index 0000000..05f730e --- /dev/null +++ b/microbenchmarks/suites/strings.py @@ -0,0 +1,74 @@ +"""String functions benchmark suite.""" + +import random +import string + +import pyarrow as pa + +from . import BenchmarkFunction, Suite + + +FUNCTIONS = [ + BenchmarkFunction("trim", "trim({col})", "trim({col})"), + BenchmarkFunction("ltrim", "ltrim({col})", "ltrim({col})"), + BenchmarkFunction("rtrim", "rtrim({col})", "rtrim({col})"), + BenchmarkFunction("lower", "lower({col})", "lower({col})"), + BenchmarkFunction("upper", "upper({col})", "upper({col})"), + BenchmarkFunction("length", "length({col})", "length({col})"), + BenchmarkFunction("char_length", "char_length({col})", "length({col})"), + BenchmarkFunction("reverse", "reverse({col})", "reverse({col})"), + BenchmarkFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"), + BenchmarkFunction("concat", "concat({col}, {col})", "concat({col}, {col})"), + BenchmarkFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"), + BenchmarkFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"), + BenchmarkFunction("left_5", "left({col}, 5)", "left({col}, 5)"), + BenchmarkFunction("right_5", "right({col}, 5)", "right({col}, 5)"), + BenchmarkFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"), + BenchmarkFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"), + BenchmarkFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"), + BenchmarkFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"), + BenchmarkFunction("ascii", "ascii({col})", "ascii({col})"), + BenchmarkFunction("md5", "md5({col})", "md5({col})"), + BenchmarkFunction("sha256", "sha256({col})", "sha256({col})"), + BenchmarkFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"), + BenchmarkFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"), + BenchmarkFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"), + BenchmarkFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"), + BenchmarkFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"), + BenchmarkFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with various string patterns.""" + random.seed(42) # For reproducibility + + strings_data = [] + for i in range(num_rows): + pattern_type = i % 5 + if pattern_type == 0: + s = f" test_{i % 1000} " + elif pattern_type == 1: + s = ''.join(random.choices(string.ascii_lowercase, k=20)) + elif pattern_type == 2: + s = f"TestData_{i}_Value" + elif pattern_type == 3: + s = f"hello world {i % 100} data" + else: + length = random.randint(5, 50) + s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) + strings_data.append(s) + + str_type = pa.string_view() if use_string_view else pa.string() + return pa.table({ + 'str_col': pa.array(strings_data, type=str_type) + }) + + +SUITE = Suite( + name="strings", + description="String function benchmarks", + column_name="str_col", + functions=FUNCTIONS, + generate_data=generate_data, +) diff --git a/microbenchmarks/suites/temporal.py b/microbenchmarks/suites/temporal.py new file mode 100644 index 0000000..6f64286 --- /dev/null +++ b/microbenchmarks/suites/temporal.py @@ -0,0 +1,91 @@ +"""Temporal (date/time) functions benchmark suite.""" + +import random +from datetime import datetime, timedelta + +import pyarrow as pa + +from . import BenchmarkFunction, Suite + + +FUNCTIONS = [ + # Date extraction functions + BenchmarkFunction("year", "year({col})", "year({col})"), + BenchmarkFunction("month", "month({col})", "month({col})"), + BenchmarkFunction("day", "day({col})", "day({col})"), + BenchmarkFunction("hour", "hour({col})", "hour({col})"), + BenchmarkFunction("minute", "minute({col})", "minute({col})"), + BenchmarkFunction("second", "second({col})", "second({col})"), + BenchmarkFunction("week", "week({col})", "week({col})"), + BenchmarkFunction("quarter", "quarter({col})", "quarter({col})"), + BenchmarkFunction("day_of_week", "extract(dow from {col})", "dayofweek({col})"), + BenchmarkFunction("day_of_year", "extract(doy from {col})", "dayofyear({col})"), + + # Date truncation + BenchmarkFunction("date_trunc_day", "date_trunc('day', {col})", "date_trunc('day', {col})"), + BenchmarkFunction("date_trunc_month", "date_trunc('month', {col})", "date_trunc('month', {col})"), + BenchmarkFunction("date_trunc_year", "date_trunc('year', {col})", "date_trunc('year', {col})"), + BenchmarkFunction("date_trunc_hour", "date_trunc('hour', {col})", "date_trunc('hour', {col})"), + + # Date arithmetic + BenchmarkFunction("date_add_days", "{col} + interval '7 days'", "{col} + interval '7 days'"), + BenchmarkFunction("date_sub_days", "{col} - interval '7 days'", "{col} - interval '7 days'"), + BenchmarkFunction("date_add_months", "{col} + interval '1 month'", "{col} + interval '1 month'"), + + # Date formatting/parsing + BenchmarkFunction("to_char", "to_char({col}, '%Y-%m-%d')", "strftime({col}, '%Y-%m-%d')"), + + # Date parts + BenchmarkFunction("date_part_hour", "date_part('hour', {col})", "date_part('hour', {col})"), + BenchmarkFunction("date_part_minute", "date_part('minute', {col})", "date_part('minute', {col})"), + + # Current date/time comparisons + BenchmarkFunction("is_past", "{col} < now()", "{col} < now()"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with various timestamp patterns.""" + random.seed(42) # For reproducibility + + # Generate timestamps spanning several years + base_date = datetime(2020, 1, 1) + max_days = 365 * 5 # 5 years of data + + timestamps = [] + for i in range(num_rows): + # Mix of different timestamp patterns + pattern_type = i % 4 + if pattern_type == 0: + # Random timestamp within range + days = random.randint(0, max_days) + hours = random.randint(0, 23) + minutes = random.randint(0, 59) + seconds = random.randint(0, 59) + ts = base_date + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds) + elif pattern_type == 1: + # Timestamps at midnight (common pattern) + days = random.randint(0, max_days) + ts = base_date + timedelta(days=days) + elif pattern_type == 2: + # Timestamps at specific hours (business hours) + days = random.randint(0, max_days) + hours = random.choice([9, 10, 11, 12, 13, 14, 15, 16, 17]) + ts = base_date + timedelta(days=days, hours=hours) + else: + # Sequential timestamps (time series pattern) + ts = base_date + timedelta(seconds=i) + timestamps.append(ts) + + return pa.table({ + 'ts_col': pa.array(timestamps, type=pa.timestamp('us')) + }) + + +SUITE = Suite( + name="temporal", + description="Date/time function benchmarks", + column_name="ts_col", + functions=FUNCTIONS, + generate_data=generate_data, +) From 87f5bdab1f4da5c23dc5177fee6cb14f4129d408 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 16:06:31 -0700 Subject: [PATCH 2/6] fix --- microbenchmarks/README.md | 88 ++++++++++++++++-- .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 1603 bytes .../__pycache__/strings.cpython-310.pyc | Bin 0 -> 2506 bytes .../__pycache__/temporal.cpython-310.pyc | Bin 0 -> 2607 bytes microbenchmarks/suites/temporal.py | 20 ++-- 5 files changed, 90 insertions(+), 18 deletions(-) create mode 100644 microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc create mode 100644 microbenchmarks/suites/__pycache__/strings.cpython-310.pyc create mode 100644 microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md index d4ddc1f..b086a22 100644 --- a/microbenchmarks/README.md +++ b/microbenchmarks/README.md @@ -25,6 +25,15 @@ This directory contains microbenchmarks for comparing DataFusion and DuckDB perf The benchmarks generate synthetic data, write it to Parquet format, and then measure the execution time of various SQL functions across both DataFusion and DuckDB. Results include per-function timing comparisons and summary statistics. +Benchmarks are organized into **suites**, each focusing on a specific category of SQL functions: + +| Suite | Description | Functions | +|-------|-------------|-----------| +| `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 | +| `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 | + +All benchmarks run in single-threaded mode for fair comparison between engines. + ## Setup Create a virtual environment and install dependencies: @@ -41,36 +50,44 @@ pip install -r requirements.txt Run a benchmark: ```shell -python microbenchmarks.py +python microbenchmarks.py --suite strings ``` ### Options | Option | Default | Description | |--------|---------|-------------| +| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`) | | `--rows` | `1000000` | Number of rows in the generated test data | | `--warmup` | `2` | Number of warmup iterations before timing | | `--iterations` | `5` | Number of timed iterations (results are averaged) | | `--output` | stdout | Output file path for markdown results | +| `--string-view` | `false` | Use Arrow StringView type instead of String | ### Examples -Run the benchmark with default settings: +Run the string functions benchmark (default): ```shell -python microbenchmark.py +python microbenchmarks.py ``` -Run the benchmark with 10 million rows: +Run the temporal functions benchmark: ```shell -python microbenchmarks.py --rows 10000000 +python microbenchmarks.py --suite temporal ``` -Run the benchmark and save results to a file: +Run with 10 million rows: ```shell -python microbenchmarks.py --output results.md +python microbenchmarks.py --suite strings --rows 10000000 +``` + +Run with StringView type and save results: + +```shell +python microbenchmarks.py --suite strings --string-view --output results.md ``` ## Output @@ -83,4 +100,59 @@ The benchmark outputs a markdown table comparing execution times: | lower | 8.90 | 7.50 | 1.19x | DuckDB | | ... | ... | ... | ... | ... | -A summary section shows overall statistics including how many functions each engine won and total execution times. \ No newline at end of file +A summary section shows overall statistics including how many functions each engine won and total execution times. + +## Project Structure + +``` +microbenchmarks/ +├── microbenchmarks.py # Main benchmark runner +├── requirements.txt # Python dependencies +└── suites/ # Benchmark suite definitions + ├── __init__.py # Suite registry and base classes + ├── strings.py # String function benchmarks + └── temporal.py # Date/time function benchmarks +``` + +## Adding New Suites + +To add a new benchmark suite: + +1. Create a new file in `suites/` (e.g., `suites/numeric.py`) + +2. Define your functions and data generator: + +```python +from . import BenchmarkFunction, Suite +import pyarrow as pa + +FUNCTIONS = [ + BenchmarkFunction("abs", "abs({col})", "abs({col})"), + BenchmarkFunction("sqrt", "sqrt({col})", "sqrt({col})"), + # ... more functions +] + +def generate_data(num_rows: int, use_string_view: bool = False) -> pa.Table: + # Generate appropriate test data + return pa.table({'num_col': pa.array(...)}) + +SUITE = Suite( + name="numeric", + description="Numeric function benchmarks", + column_name="num_col", + functions=FUNCTIONS, + generate_data=generate_data, +) +``` + +3. Register the suite in `suites/__init__.py`: + +```python +from . import numeric + +SUITES: dict[str, Suite] = { + 'strings': strings.SUITE, + 'temporal': temporal.SUITE, + 'numeric': numeric.SUITE, # Add new suite here +} +``` \ No newline at end of file diff --git a/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc b/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29c00ebb00c9cde6401dd62130527804bf8df5b2 GIT binary patch literal 1603 zcma)6O>Y}F5GA>kwED7S#|i4BEfz>~P|-+5kAYF7bSbi!graUEpAgvY#G^Q;nAHG4`_MO9xCRlQ8)m0PL&A3U4Zk)Ds^R%T}KeM>4w{%B`Beq9$SGD(v0$*W-tFGF9 z$>VMCc64WtjF{R{w;*y$@gvgPT|D?&=gG-LnzP8xQm1V+Doiv<6H^S=8QXsgO_3h; zbtRolGHES1;epIDIn4B7g6W$h0Ov7G`zaVrVhVULK!wJE=DMQTJ}IBU>dB$z3e;2e z+t>AcJIfQ77Wv}yZ}ce5p)whbR3n#HK zE5vWaY(a;m`Rj0SQcUzf=4w6|r*0spGC9$MO%)F}hz{OEc(4S2AViv{PKf?=ZctJo zf{4m%5Y9px(iSb_|Bd)F*!=$x*Hv8u#Z*)USO|4pH^vM#Rq6D3S&F7_DsU^{t8HS^DT3phNs-Mad5N&EuZgq1J=VE4(rJOf9QN7uRKQuZdh2ZI=HOCb(A zuEG1$d|XP}>B+P(GV3vO53=S%FurN6RLwM!T8G6Fj^g3>JvM0ZjV^m`mcw z3u)wF8ZpN{EK@2o>00 zu+O4J=Quyhi}QS$dlW6&{phO~GA+NpM^TTPkKj_zPm477{7lcSZ+tJanSNqSVW5|i zp1NEgxd9g<;MS~*%sXey_x&qBrx9@0eA;6E%M}uquAmVg*NEo-ZLO9 zy1r}G4>n`_y};at6J@z3!qxTXvmwqmk@96d_91Mth;NITPvFtpEd>HvTmOl)HJJBR zH=hEHOO@27a|ykln-3thR1ZqQlN+>FN?KRYsWfpL)s4~Jj_dU4$i}Q zi9Ub}a6zJrV1q5uC0Kw3i7vwxxFXS2xCYlGs=*>GO60&2EJ^esEW@%y*I@-#B&x$I ztVy&E8?YhK4cLTDiEhFzxFyj?a2sw*^f7z_ci!fj=Nfr<0-uor+#^N!oapEsLlQKMhXEZ9HZ7a^x?9z5hgQ#imbs|3v;)vP1B=TErkG`^5CrHUk zPbBME50j%f8lSp9q_+kmxUntlxQ>q6SxK~u-}!NPC2(Ib4k^@IG-$R2F(JV?6DA(G>#Ny}07GJZb=j^h)x^LdEx^CfBndsA zWOKN10frVe50_B;8a)(J>P0N{Qj#r<;CnL?#2sO}w!U#=6U_~e`9YACrRbYEJlBTJ ztccA}VX~IDzIijN^yN@(?8=QXCB^6;cFoBq1u?GSMV5p?>Lwmdv$OB7>VwO$!<%lN8;jU_uH;FrlWOntR z{gLfAo`~5?5j4zDw)9`j(6(7~diU;6C|f)qb=oeC4;Y{5Fyaoj)$4#9aFdd>LnB`k zCwB1UQCj#tz>ocoO{yGYWj#?15h`%0r1Ku3)+ ztNfsV`D5#4DC0@mAi+KGz%3i2i7Z$r78!%-DE>N0ApsNI4I z^fxK}O-W`J=VE~d(7J{}0i<%T4!%7j{Sw z2Pj#&MZz$)4`LdE-22;w%^86m(8>@~5- z?#DsIEx#29K4Ef4VpZW2^1!*WdiowCc=B;@5T(2_V2D^64uGH;q$oWstr2i7@x&pK zrAh7~BlUJef(MyW@6gH9OE^KgfDJGA&6q3V$JKzB2k~;ngt^M7xaWCZtVC#jQs|ag z-52#%+$MFL&tV<^UH1~tZ;`t2ve#jFW7S0HV)i!s@!s2h<(?kBFFR=Hrz@MF|P3e z{Zy#Lw5ix1R2O7n>-n<>J5RrSw#{=QbNX%^huqi^`RLEv*na+W=aG!+So2s6&fjUr m&-b}2_gSG zkHR|sldn}p0i~^g3R+KXt9aLYTBxzWvo~SBVYl@EtrQMN)-l$7Z_*+Lq$J7`C=a z*pI2-b5_7-q{V`exX#`n>aZY=5@(l0oo?vUH%>AL7+IT$GHkjUw*_6Wc~PuDdd#0- zH57cd@N{amkvZb#mN4hWA@S+(ys-89K4mLSp=jQ6e(A)$u{$gaaU~VF8O9Oo4lm@` zQZu7hq&(&8;2#bbb8M7n+!9JtrqPWD^I-_Zj9V7slw1gdXu!yDDd#y=%?zrk;>08& zoj8IylT?*6NflEQwaeG9CWB)Xd%#1d*~>|JrHP7@JvrK9L!`{K=xMSu z?tImGJ;#|{|ak8p7fZitTL zrl@wpt+dDp(vM5xs`Q#5i=w&3Nl{=iS|??(i1JOUVyTG*C#^DRh-Cp6SCgi6Szwhm z@x}`*@$f^ti#D)5-17|5+Y7UOLHJ!qtFRvgCOp zs9EIny6_@{*I4qqLSLZ#&t|I|hoptU9k%uZ*7Ey)r%PI*HhY5v2jA+++)G+v(4lco zmV+>9v21Xz^$&S!a6_&&^^{uEE66#c z7XG6z(TLkq&%?{l@s;IBKAFg~ft=90$aYO`L_SX8GLWQ0gT5Hsyo^N}gpnto6i=qP z^pADDgoYb#om-vKkhggsJfQlaD3N^*!g22U%oy}+4ufE^h;yTuq z8@n;KgSjn=m_~1I?Y!E4A=8{&HiWVLIE-P?BTqyVCqiW?wn&85&`M@W{piv&O?uZ$ S;;tKiE?v{EYwGkW{rf)~hYptj literal 0 HcmV?d00001 diff --git a/microbenchmarks/suites/temporal.py b/microbenchmarks/suites/temporal.py index 6f64286..3e1c02e 100644 --- a/microbenchmarks/suites/temporal.py +++ b/microbenchmarks/suites/temporal.py @@ -10,16 +10,16 @@ FUNCTIONS = [ # Date extraction functions - BenchmarkFunction("year", "year({col})", "year({col})"), - BenchmarkFunction("month", "month({col})", "month({col})"), - BenchmarkFunction("day", "day({col})", "day({col})"), - BenchmarkFunction("hour", "hour({col})", "hour({col})"), - BenchmarkFunction("minute", "minute({col})", "minute({col})"), - BenchmarkFunction("second", "second({col})", "second({col})"), - BenchmarkFunction("week", "week({col})", "week({col})"), - BenchmarkFunction("quarter", "quarter({col})", "quarter({col})"), - BenchmarkFunction("day_of_week", "extract(dow from {col})", "dayofweek({col})"), - BenchmarkFunction("day_of_year", "extract(doy from {col})", "dayofyear({col})"), + BenchmarkFunction("year", "date_part('year', {col})", "year({col})"), + BenchmarkFunction("month", "date_part('month', {col})", "month({col})"), + BenchmarkFunction("day", "date_part('day', {col})", "day({col})"), + BenchmarkFunction("hour", "date_part('hour', {col})", "hour({col})"), + BenchmarkFunction("minute", "date_part('minute', {col})", "minute({col})"), + BenchmarkFunction("second", "date_part('second', {col})", "second({col})"), + BenchmarkFunction("week", "date_part('week', {col})", "week({col})"), + BenchmarkFunction("quarter", "date_part('quarter', {col})", "quarter({col})"), + BenchmarkFunction("day_of_week", "date_part('dow', {col})", "dayofweek({col})"), + BenchmarkFunction("day_of_year", "date_part('doy', {col})", "dayofyear({col})"), # Date truncation BenchmarkFunction("date_trunc_day", "date_trunc('day', {col})", "date_trunc('day', {col})"), From f49e4fc6358e6f914b081a816c36848f9139a21f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 16:07:09 -0700 Subject: [PATCH 3/6] delete cache --- .../suites/__pycache__/__init__.cpython-310.pyc | Bin 1603 -> 0 bytes .../suites/__pycache__/strings.cpython-310.pyc | Bin 2506 -> 0 bytes .../suites/__pycache__/temporal.cpython-310.pyc | Bin 2607 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc delete mode 100644 microbenchmarks/suites/__pycache__/strings.cpython-310.pyc delete mode 100644 microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc diff --git a/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc b/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 29c00ebb00c9cde6401dd62130527804bf8df5b2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1603 zcma)6O>Y}F5GA>kwED7S#|i4BEfz>~P|-+5kAYF7bSbi!graUEpAgvY#G^Q;nAHG4`_MO9xCRlQ8)m0PL&A3U4Zk)Ds^R%T}KeM>4w{%B`Beq9$SGD(v0$*W-tFGF9 z$>VMCc64WtjF{R{w;*y$@gvgPT|D?&=gG-LnzP8xQm1V+Doiv<6H^S=8QXsgO_3h; zbtRolGHES1;epIDIn4B7g6W$h0Ov7G`zaVrVhVULK!wJE=DMQTJ}IBU>dB$z3e;2e z+t>AcJIfQ77Wv}yZ}ce5p)whbR3n#HK zE5vWaY(a;m`Rj0SQcUzf=4w6|r*0spGC9$MO%)F}hz{OEc(4S2AViv{PKf?=ZctJo zf{4m%5Y9px(iSb_|Bd)F*!=$x*Hv8u#Z*)USO|4pH^vM#Rq6D3S&F7_DsU^{t8HS^DT3phNs-Mad5N&EuZgq1J=VE4(rJOf9QN7uRKQuZdh2ZI=HOCb(A zuEG1$d|XP}>B+P(GV3vO53=S%FurN6RLwM!T8G6Fj^g3>JvM0ZjV^m`mcw z3u)wF8ZpN{EK@2o>00 zu+O4J=Quyhi}QS$dlW6&{phO~GA+NpM^TTPkKj_zPm477{7lcSZ+tJanSNqSVW5|i zp1NEgxd9g<;MS~*%sXey_x&qBrx9@0eA;6E%M}uquAmVg*NEo-ZLO9 zy1r}G4>n`_y};at6J@z3!qxTXvmwqmk@96d_91Mth;NITPvFtpEd>HvTmOl)HJJBR zH=hEHOO@27a|ykln-3thR1ZqQlN+>FN?KRYsWfpL)s4~Jj_dU4$i}Q zi9Ub}a6zJrV1q5uC0Kw3i7vwxxFXS2xCYlGs=*>GO60&2EJ^esEW@%y*I@-#B&x$I ztVy&E8?YhK4cLTDiEhFzxFyj?a2sw*^f7z_ci!fj=Nfr<0-uor+#^N!oapEsLlQKMhXEZ9HZ7a^x?9z5hgQ#imbs|3v;)vP1B=TErkG`^5CrHUk zPbBME50j%f8lSp9q_+kmxUntlxQ>q6SxK~u-}!NPC2(Ib4k^@IG-$R2F(JV?6DA(G>#Ny}07GJZb=j^h)x^LdEx^CfBndsA zWOKN10frVe50_B;8a)(J>P0N{Qj#r<;CnL?#2sO}w!U#=6U_~e`9YACrRbYEJlBTJ ztccA}VX~IDzIijN^yN@(?8=QXCB^6;cFoBq1u?GSMV5p?>Lwmdv$OB7>VwO$!<%lN8;jU_uH;FrlWOntR z{gLfAo`~5?5j4zDw)9`j(6(7~diU;6C|f)qb=oeC4;Y{5Fyaoj)$4#9aFdd>LnB`k zCwB1UQCj#tz>ocoO{yGYWj#?15h`%0r1Ku3)+ ztNfsV`D5#4DC0@mAi+KGz%3i2i7Z$r78!%-DE>N0ApsNI4I z^fxK}O-W`J=VE~d(7J{}0i<%T4!%7j{Sw z2Pj#&MZz$)4`LdE-22;w%^86m(8>@~5- z?#DsIEx#29K4Ef4VpZW2^1!*WdiowCc=B;@5T(2_V2D^64uGH;q$oWstr2i7@x&pK zrAh7~BlUJef(MyW@6gH9OE^KgfDJGA&6q3V$JKzB2k~;ngt^M7xaWCZtVC#jQs|ag z-52#%+$MFL&tV<^UH1~tZ;`t2ve#jFW7S0HV)i!s@!s2h<(?kBFFR=Hrz@MF|P3e z{Zy#Lw5ix1R2O7n>-n<>J5RrSw#{=QbNX%^huqi^`RLEv*na+W=aG!+So2s6&fjUr m&-b}2_gSG zkHR|sldn}p0i~^g3R+KXt9aLYTBxzWvo~SBVYl@EtrQMN)-l$7Z_*+Lq$J7`C=a z*pI2-b5_7-q{V`exX#`n>aZY=5@(l0oo?vUH%>AL7+IT$GHkjUw*_6Wc~PuDdd#0- zH57cd@N{amkvZb#mN4hWA@S+(ys-89K4mLSp=jQ6e(A)$u{$gaaU~VF8O9Oo4lm@` zQZu7hq&(&8;2#bbb8M7n+!9JtrqPWD^I-_Zj9V7slw1gdXu!yDDd#y=%?zrk;>08& zoj8IylT?*6NflEQwaeG9CWB)Xd%#1d*~>|JrHP7@JvrK9L!`{K=xMSu z?tImGJ;#|{|ak8p7fZitTL zrl@wpt+dDp(vM5xs`Q#5i=w&3Nl{=iS|??(i1JOUVyTG*C#^DRh-Cp6SCgi6Szwhm z@x}`*@$f^ti#D)5-17|5+Y7UOLHJ!qtFRvgCOp zs9EIny6_@{*I4qqLSLZ#&t|I|hoptU9k%uZ*7Ey)r%PI*HhY5v2jA+++)G+v(4lco zmV+>9v21Xz^$&S!a6_&&^^{uEE66#c z7XG6z(TLkq&%?{l@s;IBKAFg~ft=90$aYO`L_SX8GLWQ0gT5Hsyo^N}gpnto6i=qP z^pADDgoYb#om-vKkhggsJfQlaD3N^*!g22U%oy}+4ufE^h;yTuq z8@n;KgSjn=m_~1I?Y!E4A=8{&HiWVLIE-P?BTqyVCqiW?wn&85&`M@W{piv&O?uZ$ S;;tKiE?v{EYwGkW{rf)~hYptj From 319a951337f80059ba6cff08f59bb98773eac1cc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 16:07:13 -0700 Subject: [PATCH 4/6] delete cache --- microbenchmarks/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 microbenchmarks/.gitignore diff --git a/microbenchmarks/.gitignore b/microbenchmarks/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/microbenchmarks/.gitignore @@ -0,0 +1 @@ +__pycache__ From 97c22304cc743566ccb7d549de6f34a6d0e3e0e5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 16:11:59 -0700 Subject: [PATCH 5/6] numeric suite --- microbenchmarks/README.md | 3 +- microbenchmarks/suites/__init__.py | 2 + microbenchmarks/suites/numeric.py | 104 +++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 microbenchmarks/suites/numeric.py diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md index b086a22..1001816 100644 --- a/microbenchmarks/README.md +++ b/microbenchmarks/README.md @@ -31,6 +31,7 @@ Benchmarks are organized into **suites**, each focusing on a specific category o |-------|-------------|-----------| | `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 | | `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 | +| `numeric` | Math functions (sqrt, pow, sin, cos, log, round, etc.) | 38 | All benchmarks run in single-threaded mode for fair comparison between engines. @@ -57,7 +58,7 @@ python microbenchmarks.py --suite strings | Option | Default | Description | |--------|---------|-------------| -| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`) | +| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`) | | `--rows` | `1000000` | Number of rows in the generated test data | | `--warmup` | `2` | Number of warmup iterations before timing | | `--iterations` | `5` | Number of timed iterations (results are averaged) | diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py index dbfc859..9527f1a 100644 --- a/microbenchmarks/suites/__init__.py +++ b/microbenchmarks/suites/__init__.py @@ -26,11 +26,13 @@ class Suite: # Import suites to register them from . import strings from . import temporal +from . import numeric # Registry of available suites SUITES: dict[str, Suite] = { 'strings': strings.SUITE, 'temporal': temporal.SUITE, + 'numeric': numeric.SUITE, } diff --git a/microbenchmarks/suites/numeric.py b/microbenchmarks/suites/numeric.py new file mode 100644 index 0000000..9ad4ac4 --- /dev/null +++ b/microbenchmarks/suites/numeric.py @@ -0,0 +1,104 @@ +"""Numeric/math functions benchmark suite.""" + +import random + +import pyarrow as pa + +from . import BenchmarkFunction, Suite + + +FUNCTIONS = [ + # Basic math + BenchmarkFunction("abs", "abs({col})", "abs({col})"), + BenchmarkFunction("ceil", "ceil({col})", "ceil({col})"), + BenchmarkFunction("floor", "floor({col})", "floor({col})"), + BenchmarkFunction("round", "round({col}, 2)", "round({col}, 2)"), + BenchmarkFunction("trunc", "trunc({col})", "trunc({col})"), + BenchmarkFunction("signum", "signum({col})", "sign({col})"), + + # Powers and roots + BenchmarkFunction("sqrt", "sqrt(abs({col}))", "sqrt(abs({col}))"), + BenchmarkFunction("cbrt", "cbrt({col})", "cbrt({col})"), + BenchmarkFunction("power", "power({col}, 2)", "power({col}, 2)"), + BenchmarkFunction("exp", "exp({col} / 100)", "exp({col} / 100)"), + + # Logarithms + BenchmarkFunction("ln", "ln(abs({col}) + 1)", "ln(abs({col}) + 1)"), + BenchmarkFunction("log10", "log10(abs({col}) + 1)", "log10(abs({col}) + 1)"), + BenchmarkFunction("log2", "log2(abs({col}) + 1)", "log2(abs({col}) + 1)"), + BenchmarkFunction("log", "log(2, abs({col}) + 1)", "log(2, abs({col}) + 1)"), + + # Trigonometric + BenchmarkFunction("sin", "sin({col})", "sin({col})"), + BenchmarkFunction("cos", "cos({col})", "cos({col})"), + BenchmarkFunction("tan", "tan({col})", "tan({col})"), + BenchmarkFunction("asin", "asin(sin({col}))", "asin(sin({col}))"), + BenchmarkFunction("acos", "acos(cos({col}))", "acos(cos({col}))"), + BenchmarkFunction("atan", "atan({col})", "atan({col})"), + BenchmarkFunction("atan2", "atan2({col}, {col} + 1)", "atan2({col}, {col} + 1)"), + + # Hyperbolic + BenchmarkFunction("sinh", "sinh({col} / 100)", "sinh({col} / 100)"), + BenchmarkFunction("cosh", "cosh({col} / 100)", "cosh({col} / 100)"), + BenchmarkFunction("tanh", "tanh({col})", "tanh({col})"), + + # Other math functions + BenchmarkFunction("degrees", "degrees({col})", "degrees({col})"), + BenchmarkFunction("radians", "radians({col})", "radians({col})"), + BenchmarkFunction("pi", "pi() * {col}", "pi() * {col}"), + BenchmarkFunction("mod", "CAST({col} AS BIGINT) % 7", "CAST({col} AS BIGINT) % 7"), + BenchmarkFunction("gcd", "gcd(CAST({col} AS BIGINT), 12)", "gcd(CAST({col} AS BIGINT), 12)"), + BenchmarkFunction("lcm", "lcm(CAST(abs({col}) AS BIGINT) % 1000 + 1, 12)", "lcm(CAST(abs({col}) AS BIGINT) % 1000 + 1, 12)"), + BenchmarkFunction("factorial", "factorial(CAST(abs({col}) AS BIGINT) % 20)", "factorial(CAST(abs({col}) AS INTEGER) % 20)"), + + # Comparison + BenchmarkFunction("greatest", "greatest({col}, {col} * 2, 0)", "greatest({col}, {col} * 2, 0)"), + BenchmarkFunction("least", "least({col}, {col} * 2, 0)", "least({col}, {col} * 2, 0)"), + + # Null handling with numeric + BenchmarkFunction("coalesce", "coalesce({col}, 0)", "coalesce({col}, 0)"), + BenchmarkFunction("nullif", "nullif({col}, 0)", "nullif({col}, 0)"), + + # Bitwise (on integers) + BenchmarkFunction("bit_and", "CAST({col} AS BIGINT) & 255", "CAST({col} AS BIGINT) & 255"), + BenchmarkFunction("bit_or", "CAST({col} AS BIGINT) | 255", "CAST({col} AS BIGINT) | 255"), + BenchmarkFunction("bit_xor", "CAST({col} AS BIGINT) ^ 255", "xor(CAST({col} AS BIGINT), 255)"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with various numeric patterns.""" + random.seed(42) # For reproducibility + + values = [] + for i in range(num_rows): + pattern_type = i % 5 + if pattern_type == 0: + # Small integers + v = random.randint(-100, 100) + elif pattern_type == 1: + # Larger integers + v = random.randint(-10000, 10000) + elif pattern_type == 2: + # Floating point values + v = random.uniform(-1000, 1000) + elif pattern_type == 3: + # Small decimals + v = random.uniform(-1, 1) + else: + # Mixed range + v = random.gauss(0, 500) + values.append(v) + + return pa.table({ + 'num_col': pa.array(values, type=pa.float64()) + }) + + +SUITE = Suite( + name="numeric", + description="Numeric function benchmarks", + column_name="num_col", + functions=FUNCTIONS, + generate_data=generate_data, +) From a254519618dbd43ff59a5fc67e4c56381a77d2d8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 16:18:11 -0700 Subject: [PATCH 6/6] add conditional suite --- microbenchmarks/README.md | 3 +- microbenchmarks/suites/__init__.py | 2 + microbenchmarks/suites/conditional.py | 110 ++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 microbenchmarks/suites/conditional.py diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md index 1001816..1fbfdc6 100644 --- a/microbenchmarks/README.md +++ b/microbenchmarks/README.md @@ -32,6 +32,7 @@ Benchmarks are organized into **suites**, each focusing on a specific category o | `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 | | `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 | | `numeric` | Math functions (sqrt, pow, sin, cos, log, round, etc.) | 38 | +| `conditional` | Conditional logic (CASE, COALESCE, boolean ops, comparisons) | 36 | All benchmarks run in single-threaded mode for fair comparison between engines. @@ -58,7 +59,7 @@ python microbenchmarks.py --suite strings | Option | Default | Description | |--------|---------|-------------| -| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`) | +| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`, `conditional`) | | `--rows` | `1000000` | Number of rows in the generated test data | | `--warmup` | `2` | Number of warmup iterations before timing | | `--iterations` | `5` | Number of timed iterations (results are averaged) | diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py index 9527f1a..329ffba 100644 --- a/microbenchmarks/suites/__init__.py +++ b/microbenchmarks/suites/__init__.py @@ -27,12 +27,14 @@ class Suite: from . import strings from . import temporal from . import numeric +from . import conditional # Registry of available suites SUITES: dict[str, Suite] = { 'strings': strings.SUITE, 'temporal': temporal.SUITE, 'numeric': numeric.SUITE, + 'conditional': conditional.SUITE, } diff --git a/microbenchmarks/suites/conditional.py b/microbenchmarks/suites/conditional.py new file mode 100644 index 0000000..0ebbf2a --- /dev/null +++ b/microbenchmarks/suites/conditional.py @@ -0,0 +1,110 @@ +"""Conditional/logic functions benchmark suite.""" + +import random + +import pyarrow as pa + +from . import BenchmarkFunction, Suite + + +FUNCTIONS = [ + # CASE expressions + BenchmarkFunction("case_simple", + "CASE {col} WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'other' END", + "CASE {col} WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'other' END"), + BenchmarkFunction("case_searched", + "CASE WHEN {col} < 0 THEN 'negative' WHEN {col} = 0 THEN 'zero' ELSE 'positive' END", + "CASE WHEN {col} < 0 THEN 'negative' WHEN {col} = 0 THEN 'zero' ELSE 'positive' END"), + BenchmarkFunction("case_many_branches", + "CASE WHEN {col} < -50 THEN 'a' WHEN {col} < -25 THEN 'b' WHEN {col} < 0 THEN 'c' WHEN {col} < 25 THEN 'd' WHEN {col} < 50 THEN 'e' ELSE 'f' END", + "CASE WHEN {col} < -50 THEN 'a' WHEN {col} < -25 THEN 'b' WHEN {col} < 0 THEN 'c' WHEN {col} < 25 THEN 'd' WHEN {col} < 50 THEN 'e' ELSE 'f' END"), + BenchmarkFunction("case_nested", + "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END", + "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END"), + + # NULL handling + BenchmarkFunction("coalesce_2", "COALESCE(nullable_col, 0)", "COALESCE(nullable_col, 0)"), + BenchmarkFunction("coalesce_3", "COALESCE(nullable_col, {col}, 0)", "COALESCE(nullable_col, {col}, 0)"), + BenchmarkFunction("coalesce_many", "COALESCE(nullable_col, NULL, NULL, {col}, 0)", "COALESCE(nullable_col, NULL, NULL, {col}, 0)"), + BenchmarkFunction("nullif", "NULLIF({col}, 0)", "NULLIF({col}, 0)"), + BenchmarkFunction("nullif_expr", "NULLIF({col} % 10, 5)", "NULLIF({col} % 10, 5)"), + BenchmarkFunction("ifnull", "IFNULL(nullable_col, -1)", "IFNULL(nullable_col, -1)"), + BenchmarkFunction("nvl", "NVL(nullable_col, -1)", "IFNULL(nullable_col, -1)"), + + # Comparison functions + BenchmarkFunction("greatest_2", "GREATEST({col}, {col} * -1)", "GREATEST({col}, {col} * -1)"), + BenchmarkFunction("greatest_3", "GREATEST({col}, 0, -100)", "GREATEST({col}, 0, -100)"), + BenchmarkFunction("least_2", "LEAST({col}, {col} * -1)", "LEAST({col}, {col} * -1)"), + BenchmarkFunction("least_3", "LEAST({col}, 0, 100)", "LEAST({col}, 0, 100)"), + + # Boolean logic + BenchmarkFunction("and_simple", "{col} > 0 AND {col} < 50", "{col} > 0 AND {col} < 50"), + BenchmarkFunction("or_simple", "{col} < -50 OR {col} > 50", "{col} < -50 OR {col} > 50"), + BenchmarkFunction("not", "NOT ({col} > 0)", "NOT ({col} > 0)"), + BenchmarkFunction("and_or_mixed", "({col} > 0 AND {col} < 50) OR {col} < -50", "({col} > 0 AND {col} < 50) OR {col} < -50"), + BenchmarkFunction("complex_bool", "({col} > 0 AND {col} < 25) OR ({col} < 0 AND {col} > -25) OR {col} = 0", + "({col} > 0 AND {col} < 25) OR ({col} < 0 AND {col} > -25) OR {col} = 0"), + + # Comparison operators + BenchmarkFunction("eq", "{col} = 0", "{col} = 0"), + BenchmarkFunction("neq", "{col} <> 0", "{col} <> 0"), + BenchmarkFunction("lt", "{col} < 0", "{col} < 0"), + BenchmarkFunction("lte", "{col} <= 0", "{col} <= 0"), + BenchmarkFunction("gt", "{col} > 0", "{col} > 0"), + BenchmarkFunction("gte", "{col} >= 0", "{col} >= 0"), + + # BETWEEN and IN + BenchmarkFunction("between", "{col} BETWEEN -50 AND 50", "{col} BETWEEN -50 AND 50"), + BenchmarkFunction("not_between", "{col} NOT BETWEEN -25 AND 25", "{col} NOT BETWEEN -25 AND 25"), + BenchmarkFunction("in_list_small", "{col} IN (1, 2, 3, 4, 5)", "{col} IN (1, 2, 3, 4, 5)"), + BenchmarkFunction("in_list_medium", "{col} IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)", "{col} IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"), + BenchmarkFunction("not_in", "{col} NOT IN (1, 2, 3, 4, 5)", "{col} NOT IN (1, 2, 3, 4, 5)"), + + # NULL checks + BenchmarkFunction("is_null", "nullable_col IS NULL", "nullable_col IS NULL"), + BenchmarkFunction("is_not_null", "nullable_col IS NOT NULL", "nullable_col IS NOT NULL"), + + # IF (conditional expression) - DataFusion uses CASE, DuckDB has IF + BenchmarkFunction("if_simple", + "CASE WHEN {col} > 0 THEN 'positive' ELSE 'non-positive' END", + "IF({col} > 0, 'positive', 'non-positive')"), + BenchmarkFunction("if_numeric", + "CASE WHEN {col} > 0 THEN {col} ELSE 0 END", + "IF({col} > 0, {col}, 0)"), + BenchmarkFunction("if_nested", + "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END", + "IF({col} > 0, IF({col} > 50, 'high', 'low'), 'negative')"), +] + + +def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: + """Generate test data with integers and nullable values.""" + random.seed(42) + + values = [] + nullable_values = [] + + for i in range(num_rows): + # Integer values in range for various conditional tests + v = random.randint(-100, 100) + values.append(v) + + # Nullable column: ~30% nulls + if random.random() < 0.3: + nullable_values.append(None) + else: + nullable_values.append(random.randint(-100, 100)) + + return pa.table({ + 'val_col': pa.array(values, type=pa.int64()), + 'nullable_col': pa.array(nullable_values, type=pa.int64()), + }) + + +SUITE = Suite( + name="conditional", + description="Conditional/logic function benchmarks", + column_name="val_col", + functions=FUNCTIONS, + generate_data=generate_data, +)