1 change: 1 addition & 0 deletions microbenchmarks/.gitignore
@@ -0,0 +1 @@
__pycache__
90 changes: 82 additions & 8 deletions microbenchmarks/README.md
@@ -25,6 +25,17 @@ This directory contains microbenchmarks for comparing DataFusion and DuckDB perf

The benchmarks generate synthetic data, write it to Parquet format, and then measure the execution time of various SQL functions across both DataFusion and DuckDB. Results include per-function timing comparisons and summary statistics.

Benchmarks are organized into **suites**, each focusing on a specific category of SQL functions:

| Suite | Description | Functions |
|-------|-------------|-----------|
| `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 |
| `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 |
| `numeric` | Math functions (sqrt, pow, sin, cos, log, round, etc.) | 38 |
| `conditional` | Conditional logic (CASE, COALESCE, boolean ops, comparisons) | 36 |

All benchmarks run in single-threaded mode for fair comparison between engines.
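
To make that concrete, here is a minimal sketch of a single-threaded run against the generated Parquet file. The `with_target_partitions(1)` call mirrors this repo's `setup_datafusion`; the DuckDB `SET threads TO 1` statement and the exact registration/query calls are illustrative assumptions, not a verbatim excerpt of the harness:

```python
import datafusion
import duckdb

# DataFusion: one target partition => single-threaded execution
config = datafusion.SessionConfig().with_target_partitions(1)
ctx = datafusion.SessionContext(config)
ctx.register_parquet("test_data", "test_data.parquet")
ctx.sql("SELECT lower(str_col) FROM test_data").collect()

# DuckDB: cap the worker pool at one thread (assumed setting)
conn = duckdb.connect()
conn.execute("SET threads TO 1")
conn.execute("SELECT lower(str_col) FROM read_parquet('test_data.parquet')").fetchall()
```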

## Setup

Create a virtual environment and install dependencies:
@@ -41,36 +41,44 @@ pip install -r requirements.txt
Run a benchmark:

```shell
python microbenchmarks.py --suite strings
```

### Options

| Option | Default | Description |
|--------|---------|-------------|
| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`, `conditional`) |
| `--rows` | `1000000` | Number of rows in the generated test data |
| `--warmup` | `2` | Number of warmup iterations before timing |
| `--iterations` | `5` | Number of timed iterations (results are averaged) |
| `--output` | stdout | Output file path for markdown results |
| `--string-view` | `false` | Use Arrow StringView type instead of String |
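
Warmup runs are executed and discarded before timing starts, and the reported figure is the mean over the timed iterations. A minimal sketch of that pattern, assuming `time.perf_counter`; the names are illustrative rather than the harness's exact internals:

```python
import time

def time_query(run_query, warmup: int = 2, iterations: int = 5) -> float:
    """Return the mean wall-clock time of run_query() in milliseconds."""
    for _ in range(warmup):           # untimed warmup iterations
        run_query()
    times = []
    for _ in range(iterations):       # timed iterations
        start = time.perf_counter()
        run_query()
        times.append((time.perf_counter() - start) * 1000.0)
    return sum(times) / len(times)    # averaged, as in the harness
```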

### Examples

Run the string functions benchmark (default):

```shell
python microbenchmarks.py
```

Run the temporal functions benchmark:

```shell
python microbenchmarks.py --suite temporal
```

Run with 10 million rows:

```shell
python microbenchmarks.py --suite strings --rows 10000000
```

Run with StringView type and save results:

```shell
python microbenchmarks.py --suite strings --string-view --output results.md
```

## Output
Expand All @@ -83,4 +102,59 @@ The benchmark outputs a markdown table comparing execution times:
| lower | 8.90 | 7.50 | 1.19x | DuckDB |
| ... | ... | ... | ... | ... |

A summary section shows overall statistics including how many functions each engine won and total execution times.
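
As a sketch, those summary statistics can be derived directly from the per-function results; the field names below follow the `BenchmarkResult` dataclass in `microbenchmarks.py`, while the report's exact formatting may differ:

```python
def summarize(results):
    # A function is "won" by the engine with the lower mean time
    df_wins = sum(1 for r in results if r.datafusion_time_ms < r.duckdb_time_ms)
    duck_wins = sum(1 for r in results if r.duckdb_time_ms < r.datafusion_time_ms)
    total_df = sum(r.datafusion_time_ms for r in results)
    total_duck = sum(r.duckdb_time_ms for r in results)
    print(f"DataFusion wins: {df_wins}, DuckDB wins: {duck_wins}")
    print(f"Total time: DataFusion {total_df:.2f} ms, DuckDB {total_duck:.2f} ms")
```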

## Project Structure

```
microbenchmarks/
├── microbenchmarks.py     # Main benchmark runner
├── requirements.txt       # Python dependencies
└── suites/                # Benchmark suite definitions
    ├── __init__.py        # Suite registry and base classes
    ├── strings.py         # String function benchmarks
    ├── temporal.py        # Date/time function benchmarks
    ├── numeric.py         # Math function benchmarks
    └── conditional.py     # Conditional logic benchmarks
```

## Adding New Suites

To add a new benchmark suite:

1. Create a new file in `suites/` (e.g., `suites/numeric.py`)

2. Define your functions and data generator:

```python
from . import BenchmarkFunction, Suite
import pyarrow as pa

FUNCTIONS = [
BenchmarkFunction("abs", "abs({col})", "abs({col})"),
BenchmarkFunction("sqrt", "sqrt({col})", "sqrt({col})"),
# ... more functions
]

def generate_data(num_rows: int, use_string_view: bool = False) -> pa.Table:
# Generate appropriate test data
return pa.table({'num_col': pa.array(...)})

SUITE = Suite(
name="numeric",
description="Numeric function benchmarks",
column_name="num_col",
functions=FUNCTIONS,
generate_data=generate_data,
)
```

3. Register the suite in `suites/__init__.py`:

```python
from . import numeric

SUITES: dict[str, Suite] = {
    'strings': strings.SUITE,
    'temporal': temporal.SUITE,
    'numeric': numeric.SUITE,  # Add new suite here
}
```
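
Once registered, the new suite is selectable like any other:

```shell
python microbenchmarks.py --suite numeric
```
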
122 changes: 28 additions & 94 deletions microbenchmarks/microbenchmarks.py
@@ -1,20 +1,20 @@
#!/usr/bin/env python3
"""
Microbenchmark comparing DataFusion and DuckDB performance
for various SQL functions on Parquet files.
"""

import tempfile
import time
import os
from dataclasses import dataclass
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import datafusion
import duckdb

from suites import get_suite, list_suites, Suite


@dataclass
class BenchmarkResult:
@@ -32,85 +32,6 @@ def speedup(self) -> float:
        return self.duckdb_time_ms / self.datafusion_time_ms


@dataclass
class StringFunction:
    """Defines a string function with syntax for both engines."""
    name: str
    datafusion_expr: str  # Expression using {col} as placeholder for column name
    duckdb_expr: str      # Expression using {col} as placeholder for column name


# String functions to benchmark
# {col} will be replaced with the actual column name
STRING_FUNCTIONS = [
    StringFunction("trim", "trim({col})", "trim({col})"),
    StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
    StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
    StringFunction("lower", "lower({col})", "lower({col})"),
    StringFunction("upper", "upper({col})", "upper({col})"),
    StringFunction("length", "length({col})", "length({col})"),
    StringFunction("char_length", "char_length({col})", "length({col})"),
    StringFunction("reverse", "reverse({col})", "reverse({col})"),
    StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
    StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
    StringFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"),
    StringFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"),
    StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
    StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
    StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
    StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
    StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"),
    StringFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"),
    StringFunction("ascii", "ascii({col})", "ascii({col})"),
    StringFunction("md5", "md5({col})", "md5({col})"),
    StringFunction("sha256", "sha256({col})", "sha256({col})"),
    StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
    StringFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"),
    StringFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"),
    StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"),
    StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
    StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"),
]


def generate_test_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table:
    """Generate test data with various string patterns."""
    import random
    import string

    random.seed(42)  # For reproducibility

    # Generate diverse string data
    strings = []
    for i in range(num_rows):
        # Mix of different string patterns
        pattern_type = i % 5
        if pattern_type == 0:
            # Short strings with spaces
            s = f" test_{i % 1000} "
        elif pattern_type == 1:
            # Longer strings
            s = ''.join(random.choices(string.ascii_lowercase, k=20))
        elif pattern_type == 2:
            # Mixed case with numbers
            s = f"TestData_{i}_Value"
        elif pattern_type == 3:
            # Strings with special patterns
            s = f"hello world {i % 100} data"
        else:
            # Random length strings
            length = random.randint(5, 50)
            s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))
        strings.append(s)

    str_type = pa.string_view() if use_string_view else pa.string()
    table = pa.table({
        'str_col': pa.array(strings, type=str_type)
    })

    return table


def setup_datafusion(parquet_path: str) -> datafusion.SessionContext:
"""Create and configure DataFusion context with single thread/partition."""
config = datafusion.SessionConfig().with_target_partitions(1)
Expand Down Expand Up @@ -167,20 +88,20 @@ def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str,
return sum(times) / len(times)


def run_benchmarks(suite: Suite,
                   num_rows: int = 1_000_000,
                   warmup: int = 2,
                   iterations: int = 5,
                   use_string_view: bool = False) -> list[BenchmarkResult]:
    """Run all benchmarks for a suite and return results."""
    results = []

    with tempfile.TemporaryDirectory() as tmpdir:
        parquet_path = os.path.join(tmpdir, 'test_data.parquet')

        # Generate and save test data
        print(f"Generating {num_rows:,} rows of test data for '{suite.name}' suite...")
        table = suite.generate_data(num_rows, use_string_view)
        pq.write_table(table, parquet_path)
        print(f"Parquet file written to: {parquet_path}")
        print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB")
@@ -195,8 +116,8 @@ def run_benchmarks(num_rows: int = 1_000_000,
        # Run benchmarks
        print(f"\nRunning benchmarks ({warmup} warmup, {iterations} iterations each)...\n")

        col = suite.column_name
        for func in suite.functions:
            df_expr = func.datafusion_expr.format(col=col)
            duck_expr = func.duckdb_expr.format(col=col)

@@ -235,12 +156,15 @@ def run_benchmarks(num_rows: int = 1_000_000,
    return results


def format_results_markdown(results: list[BenchmarkResult],
                            suite: Suite,
                            use_string_view: bool = False) -> str:
    """Format benchmark results as a markdown table."""
    str_type = "StringView" if use_string_view else "String"
    lines = [
        f"# {suite.description}: DataFusion vs DuckDB",
        "",
        f"**Suite:** {suite.name} ",
        f"**DataFusion version:** {datafusion.__version__} ",
        f"**DuckDB version:** {duckdb.__version__} ",
        f"**Rows:** {results[0].rows:,} ",
@@ -298,8 +222,15 @@ def format_results_markdown(results: list[BenchmarkResult], use_string_view: boo
def main():
    import argparse

    available_suites = list_suites()

    parser = argparse.ArgumentParser(
        description="Benchmark SQL functions: DataFusion vs DuckDB"
    )
    parser.add_argument(
        "--suite", type=str, default="strings",
        choices=available_suites,
        help=f"Benchmark suite to run (default: strings). Available: {', '.join(available_suites)}"
    )
    parser.add_argument(
        "--rows", type=int, default=1_000_000,
@@ -324,18 +255,21 @@ def main():

    args = parser.parse_args()

    suite = get_suite(args.suite)

    print("=" * 60)
    print(f"{suite.description}: DataFusion vs DuckDB")
    print("=" * 60)

    results = run_benchmarks(
        suite=suite,
        num_rows=args.rows,
        warmup=args.warmup,
        iterations=args.iterations,
        use_string_view=args.string_view
    )

    markdown = format_results_markdown(results, suite=suite, use_string_view=args.string_view)

print("\n" + "=" * 60)
print("RESULTS")
51 changes: 51 additions & 0 deletions microbenchmarks/suites/__init__.py
@@ -0,0 +1,51 @@
"""Benchmark suites for microbenchmarks."""

from dataclasses import dataclass
from typing import Callable
import pyarrow as pa


@dataclass
class BenchmarkFunction:
"""Defines a function with syntax for both engines."""
name: str
datafusion_expr: str # Expression using {col} as placeholder for column name
duckdb_expr: str # Expression using {col} as placeholder for column name


@dataclass
class Suite:
"""Defines a benchmark suite."""
name: str
description: str
column_name: str
functions: list[BenchmarkFunction]
generate_data: Callable[[int, bool], pa.Table] # (num_rows, use_string_view) -> Table


# Import suites to register them
from . import strings
from . import temporal
from . import numeric
from . import conditional

# Registry of available suites
SUITES: dict[str, Suite] = {
    'strings': strings.SUITE,
    'temporal': temporal.SUITE,
    'numeric': numeric.SUITE,
    'conditional': conditional.SUITE,
}


def get_suite(name: str) -> Suite:
"""Get a suite by name."""
if name not in SUITES:
available = ', '.join(SUITES.keys())
raise ValueError(f"Unknown suite: {name}. Available: {available}")
return SUITES[name]


def list_suites() -> list[str]:
"""List available suite names."""
return list(SUITES.keys())