From d814701c7d976966cec6c5f6f530efd2200102a3 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 31 Dec 2025 16:04:25 -0700
Subject: [PATCH 1/6] add temporal expression benchmarks

---
 microbenchmarks/microbenchmarks.py | 122 +++++++----------------------
 microbenchmarks/suites/__init__.py |  47 +++++++++++
 microbenchmarks/suites/strings.py  |  74 +++++++++++++++++
 microbenchmarks/suites/temporal.py |  91 +++++++++++++++++++++
 4 files changed, 240 insertions(+), 94 deletions(-)
 create mode 100644 microbenchmarks/suites/__init__.py
 create mode 100644 microbenchmarks/suites/strings.py
 create mode 100644 microbenchmarks/suites/temporal.py

diff --git a/microbenchmarks/microbenchmarks.py b/microbenchmarks/microbenchmarks.py
index c57483d..c904307 100755
--- a/microbenchmarks/microbenchmarks.py
+++ b/microbenchmarks/microbenchmarks.py
@@ -1,20 +1,20 @@
 #!/usr/bin/env python3
 """
 Microbenchmark comparing DataFusion and DuckDB performance
-for SQL string functions on Parquet files.
+for various SQL functions on Parquet files.
 """
 
 import tempfile
 import time
 import os
 from dataclasses import dataclass
-from pathlib import Path
 
-import pyarrow as pa
 import pyarrow.parquet as pq
 import datafusion
 import duckdb
 
+from suites import get_suite, list_suites, Suite
+
 
 @dataclass
 class BenchmarkResult:
@@ -32,85 +32,6 @@ def speedup(self) -> float:
         return self.duckdb_time_ms / self.datafusion_time_ms
 
 
-@dataclass
-class StringFunction:
-    """Defines a string function with syntax for both engines."""
-    name: str
-    datafusion_expr: str  # Expression using {col} as placeholder for column name
-    duckdb_expr: str      # Expression using {col} as placeholder for column name
-
-
-# String functions to benchmark
-# {col} will be replaced with the actual column name
-STRING_FUNCTIONS = [
-    StringFunction("trim", "trim({col})", "trim({col})"),
-    StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
-    StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
-    StringFunction("lower", "lower({col})", "lower({col})"),
-    StringFunction("upper", "upper({col})", "upper({col})"),
-    StringFunction("length", "length({col})", "length({col})"),
-    StringFunction("char_length", "char_length({col})", "length({col})"),
-    StringFunction("reverse", "reverse({col})", "reverse({col})"),
-    StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
-    StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
-    StringFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"),
-    StringFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"),
-    StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
-    StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
-    StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
-    StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
-    StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"),
-    StringFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"),
-    StringFunction("ascii", "ascii({col})", "ascii({col})"),
-    StringFunction("md5", "md5({col})", "md5({col})"),
-    StringFunction("sha256", "sha256({col})", "sha256({col})"),
-    StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
-    StringFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"),
-    StringFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"),
-    StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"),
-    StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
-    StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"),
-]
-
-
-def generate_test_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table:
-    """Generate test data with various string patterns."""
-    import random
-    import string
-
-    random.seed(42)  # For reproducibility
-
-    # Generate diverse string data
-    strings = []
-    for i in range(num_rows):
-        # Mix of different string patterns
-        pattern_type = i % 5
-        if pattern_type == 0:
-            # Short strings with spaces
-            s = f"  test_{i % 1000}  "
-        elif pattern_type == 1:
-            # Longer strings
-            s = ''.join(random.choices(string.ascii_lowercase, k=20))
-        elif pattern_type == 2:
-            # Mixed case with numbers
-            s = f"TestData_{i}_Value"
-        elif pattern_type == 3:
-            # Strings with special patterns
-            s = f"hello world {i % 100} data"
-        else:
-            # Random length strings
-            length = random.randint(5, 50)
-            s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))
-        strings.append(s)
-
-    str_type = pa.string_view() if use_string_view else pa.string()
-    table = pa.table({
-        'str_col': pa.array(strings, type=str_type)
-    })
-
-    return table
-
-
 def setup_datafusion(parquet_path: str) -> datafusion.SessionContext:
     """Create and configure DataFusion context with single thread/partition."""
     config = datafusion.SessionConfig().with_target_partitions(1)
@@ -167,20 +88,20 @@ def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str,
     return sum(times) / len(times)
 
 
-def run_benchmarks(num_rows: int = 1_000_000,
+def run_benchmarks(suite: Suite,
+                   num_rows: int = 1_000_000,
                    warmup: int = 2,
                    iterations: int = 5,
                    use_string_view: bool = False) -> list[BenchmarkResult]:
-    """Run all benchmarks and return results."""
+    """Run all benchmarks for a suite and return results."""
     results = []
 
     with tempfile.TemporaryDirectory() as tmpdir:
         parquet_path = os.path.join(tmpdir, 'test_data.parquet')
 
         # Generate and save test data
-        str_type = "StringView" if use_string_view else "String"
-        print(f"Generating {num_rows:,} rows of test data (type: {str_type})...")
-        table = generate_test_data(num_rows, use_string_view)
+        print(f"Generating {num_rows:,} rows of test data for '{suite.name}' suite...")
+        table = suite.generate_data(num_rows, use_string_view)
         pq.write_table(table, parquet_path)
         print(f"Parquet file written to: {parquet_path}")
         print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB")
@@ -195,8 +116,8 @@ def run_benchmarks(num_rows: int = 1_000_000,
         # Run benchmarks
         print(f"\nRunning benchmarks ({warmup} warmup, {iterations} iterations each)...\n")
 
-        col = 'str_col'
-        for func in STRING_FUNCTIONS:
+        col = suite.column_name
+        for func in suite.functions:
             df_expr = func.datafusion_expr.format(col=col)
             duck_expr = func.duckdb_expr.format(col=col)
 
@@ -235,12 +156,15 @@ def run_benchmarks(num_rows: int = 1_000_000,
     return results
 
 
-def format_results_markdown(results: list[BenchmarkResult], use_string_view: bool = False) -> str:
+def format_results_markdown(results: list[BenchmarkResult],
+                            suite: Suite,
+                            use_string_view: bool = False) -> str:
     """Format benchmark results as a markdown table."""
     str_type = "StringView" if use_string_view else "String"
     lines = [
-        "# String Function Microbenchmarks: DataFusion vs DuckDB",
+        f"# {suite.description}: DataFusion vs DuckDB",
         "",
+        f"**Suite:** {suite.name}  ",
         f"**DataFusion version:** {datafusion.__version__}  ",
         f"**DuckDB version:** {duckdb.__version__}  ",
         f"**Rows:** {results[0].rows:,}  ",
@@ -298,8 +222,15 @@ def format_results_markdown(results: list[BenchmarkResult], use_string_view: boo
 def main():
     import argparse
 
+    available_suites = list_suites()
+
     parser = argparse.ArgumentParser(
-        description="Benchmark string functions: DataFusion vs DuckDB"
+        description="Benchmark SQL functions: DataFusion vs DuckDB"
+    )
+    parser.add_argument(
+        "--suite", type=str, default="strings",
+        choices=available_suites,
+        help=f"Benchmark suite to run (default: strings). Available: {', '.join(available_suites)}"
     )
     parser.add_argument(
         "--rows", type=int, default=1_000_000,
@@ -324,18 +255,21 @@ def main():
 
     args = parser.parse_args()
 
+    suite = get_suite(args.suite)
+
     print("=" * 60)
-    print("String Function Microbenchmarks: DataFusion vs DuckDB")
+    print(f"{suite.description}: DataFusion vs DuckDB")
     print("=" * 60)
 
     results = run_benchmarks(
+        suite=suite,
         num_rows=args.rows,
         warmup=args.warmup,
         iterations=args.iterations,
         use_string_view=args.string_view
     )
 
-    markdown = format_results_markdown(results, use_string_view=args.string_view)
+    markdown = format_results_markdown(results, suite=suite, use_string_view=args.string_view)
 
     print("\n" + "=" * 60)
     print("RESULTS")
diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py
new file mode 100644
index 0000000..dbfc859
--- /dev/null
+++ b/microbenchmarks/suites/__init__.py
@@ -0,0 +1,47 @@
+"""Benchmark suites for microbenchmarks."""
+
+from dataclasses import dataclass
+from typing import Callable
+import pyarrow as pa
+
+
+@dataclass
+class BenchmarkFunction:
+    """Defines a function with syntax for both engines."""
+    name: str
+    datafusion_expr: str  # Expression using {col} as placeholder for column name
+    duckdb_expr: str      # Expression using {col} as placeholder for column name
+
+
+@dataclass
+class Suite:
+    """Defines a benchmark suite."""
+    name: str
+    description: str
+    column_name: str
+    functions: list[BenchmarkFunction]
+    generate_data: Callable[[int, bool], pa.Table]  # (num_rows, use_string_view) -> Table
+
+
+# Import suites to register them
+from . import strings
+from . import temporal
+
+# Registry of available suites
+SUITES: dict[str, Suite] = {
+    'strings': strings.SUITE,
+    'temporal': temporal.SUITE,
+}
+
+
+def get_suite(name: str) -> Suite:
+    """Get a suite by name."""
+    if name not in SUITES:
+        available = ', '.join(SUITES.keys())
+        raise ValueError(f"Unknown suite: {name}. Available: {available}")
+    return SUITES[name]
+
+
+def list_suites() -> list[str]:
+    """List available suite names."""
+    return list(SUITES.keys())
diff --git a/microbenchmarks/suites/strings.py b/microbenchmarks/suites/strings.py
new file mode 100644
index 0000000..05f730e
--- /dev/null
+++ b/microbenchmarks/suites/strings.py
@@ -0,0 +1,74 @@
+"""String functions benchmark suite."""
+
+import random
+import string
+
+import pyarrow as pa
+
+from . import BenchmarkFunction, Suite
+
+
+FUNCTIONS = [
+    BenchmarkFunction("trim", "trim({col})", "trim({col})"),
+    BenchmarkFunction("ltrim", "ltrim({col})", "ltrim({col})"),
+    BenchmarkFunction("rtrim", "rtrim({col})", "rtrim({col})"),
+    BenchmarkFunction("lower", "lower({col})", "lower({col})"),
+    BenchmarkFunction("upper", "upper({col})", "upper({col})"),
+    BenchmarkFunction("length", "length({col})", "length({col})"),
+    BenchmarkFunction("char_length", "char_length({col})", "length({col})"),
+    BenchmarkFunction("reverse", "reverse({col})", "reverse({col})"),
+    BenchmarkFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
+    BenchmarkFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
+    BenchmarkFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"),
+    BenchmarkFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"),
+    BenchmarkFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
+    BenchmarkFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
+    BenchmarkFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
+    BenchmarkFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
+    BenchmarkFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"),
+    BenchmarkFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"),
+    BenchmarkFunction("ascii", "ascii({col})", "ascii({col})"),
+    BenchmarkFunction("md5", "md5({col})", "md5({col})"),
+    BenchmarkFunction("sha256", "sha256({col})", "sha256({col})"),
+    BenchmarkFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
+    BenchmarkFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"),
+    BenchmarkFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"),
+    BenchmarkFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"),
+    BenchmarkFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
+    BenchmarkFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"),
+]
+
+
+def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table:
+    """Generate test data with various string patterns."""
+    random.seed(42)  # For reproducibility
+
+    strings_data = []
+    for i in range(num_rows):
+        pattern_type = i % 5
+        if pattern_type == 0:
+            s = f"  test_{i % 1000}  "
+        elif pattern_type == 1:
+            s = ''.join(random.choices(string.ascii_lowercase, k=20))
+        elif pattern_type == 2:
+            s = f"TestData_{i}_Value"
+        elif pattern_type == 3:
+            s = f"hello world {i % 100} data"
+        else:
+            length = random.randint(5, 50)
+            s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))
+        strings_data.append(s)
+
+    str_type = pa.string_view() if use_string_view else pa.string()
+    return pa.table({
+        'str_col': pa.array(strings_data, type=str_type)
+    })
+
+
+SUITE = Suite(
+    name="strings",
+    description="String function benchmarks",
+    column_name="str_col",
+    functions=FUNCTIONS,
+    generate_data=generate_data,
+)
diff --git a/microbenchmarks/suites/temporal.py b/microbenchmarks/suites/temporal.py
new file mode 100644
index 0000000..6f64286
--- /dev/null
+++ b/microbenchmarks/suites/temporal.py
@@ -0,0 +1,91 @@
+"""Temporal (date/time) functions benchmark suite."""
+
+import random
+from datetime import datetime, timedelta
+
+import pyarrow as pa
+
+from . import BenchmarkFunction, Suite
+
+
+FUNCTIONS = [
+    # Date extraction functions
+    BenchmarkFunction("year", "year({col})", "year({col})"),
+    BenchmarkFunction("month", "month({col})", "month({col})"),
+    BenchmarkFunction("day", "day({col})", "day({col})"),
+    BenchmarkFunction("hour", "hour({col})", "hour({col})"),
+    BenchmarkFunction("minute", "minute({col})", "minute({col})"),
+    BenchmarkFunction("second", "second({col})", "second({col})"),
+    BenchmarkFunction("week", "week({col})", "week({col})"),
+    BenchmarkFunction("quarter", "quarter({col})", "quarter({col})"),
+    BenchmarkFunction("day_of_week", "extract(dow from {col})", "dayofweek({col})"),
+    BenchmarkFunction("day_of_year", "extract(doy from {col})", "dayofyear({col})"),
+
+    # Date truncation
+    BenchmarkFunction("date_trunc_day", "date_trunc('day', {col})", "date_trunc('day', {col})"),
+    BenchmarkFunction("date_trunc_month", "date_trunc('month', {col})", "date_trunc('month', {col})"),
+    BenchmarkFunction("date_trunc_year", "date_trunc('year', {col})", "date_trunc('year', {col})"),
+    BenchmarkFunction("date_trunc_hour", "date_trunc('hour', {col})", "date_trunc('hour', {col})"),
+
+    # Date arithmetic
+    BenchmarkFunction("date_add_days", "{col} + interval '7 days'", "{col} + interval '7 days'"),
+    BenchmarkFunction("date_sub_days", "{col} - interval '7 days'", "{col} - interval '7 days'"),
+    BenchmarkFunction("date_add_months", "{col} + interval '1 month'", "{col} + interval '1 month'"),
+
+    # Date formatting/parsing
+    BenchmarkFunction("to_char", "to_char({col}, '%Y-%m-%d')", "strftime({col}, '%Y-%m-%d')"),
+
+    # Date parts
+    BenchmarkFunction("date_part_hour", "date_part('hour', {col})", "date_part('hour', {col})"),
+    BenchmarkFunction("date_part_minute", "date_part('minute', {col})", "date_part('minute', {col})"),
+
+    # Current date/time comparisons
+    BenchmarkFunction("is_past", "{col} < now()", "{col} < now()"),
+]
+
+
+def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table:
+    """Generate test data with various timestamp patterns."""
+    random.seed(42)  # For reproducibility
+
+    # Generate timestamps spanning several years
+    base_date = datetime(2020, 1, 1)
+    max_days = 365 * 5  # 5 years of data
+
+    timestamps = []
+    for i in range(num_rows):
+        # Mix of different timestamp patterns
+        pattern_type = i % 4
+        if pattern_type == 0:
+            # Random timestamp within range
+            days = random.randint(0, max_days)
+            hours = random.randint(0, 23)
+            minutes = random.randint(0, 59)
+            seconds = random.randint(0, 59)
+            ts = base_date + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
+        elif pattern_type == 1:
+            # Timestamps at midnight (common pattern)
+            days = random.randint(0, max_days)
+            ts = base_date + timedelta(days=days)
+        elif pattern_type == 2:
+            # Timestamps at specific hours (business hours)
+            days = random.randint(0, max_days)
+            hours = random.choice([9, 10, 11, 12, 13, 14, 15, 16, 17])
+            ts = base_date + timedelta(days=days, hours=hours)
+        else:
+            # Sequential timestamps (time series pattern)
+            ts = base_date + timedelta(seconds=i)
+        timestamps.append(ts)
+
+    return pa.table({
+        'ts_col': pa.array(timestamps, type=pa.timestamp('us'))
+    })
+
+
+SUITE = Suite(
+    name="temporal",
+    description="Date/time function benchmarks",
+    column_name="ts_col",
+    functions=FUNCTIONS,
+    generate_data=generate_data,
+)

From 87f5bdab1f4da5c23dc5177fee6cb14f4129d408 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 31 Dec 2025 16:06:31 -0700
Subject: [PATCH 2/6] fix DataFusion temporal expressions to use date_part; document suites

---
 microbenchmarks/README.md                     |  88 ++++++++++++++++--
 .../__pycache__/__init__.cpython-310.pyc      | Bin 0 -> 1603 bytes
 .../__pycache__/strings.cpython-310.pyc       | Bin 0 -> 2506 bytes
 .../__pycache__/temporal.cpython-310.pyc      | Bin 0 -> 2607 bytes
 microbenchmarks/suites/temporal.py            |  20 ++--
 5 files changed, 90 insertions(+), 18 deletions(-)
 create mode 100644 microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc
 create mode 100644 microbenchmarks/suites/__pycache__/strings.cpython-310.pyc
 create mode 100644 microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc

diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md
index d4ddc1f..b086a22 100644
--- a/microbenchmarks/README.md
+++ b/microbenchmarks/README.md
@@ -25,6 +25,15 @@ This directory contains microbenchmarks for comparing DataFusion and DuckDB perf
 
 The benchmarks generate synthetic data, write it to Parquet format, and then measure the execution time of various SQL functions across both DataFusion and DuckDB. Results include per-function timing comparisons and summary statistics.
 
+Benchmarks are organized into **suites**, each focusing on a specific category of SQL functions:
+
+| Suite | Description | Functions |
+|-------|-------------|-----------|
+| `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 |
+| `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 |
+
+All benchmarks run in single-threaded mode for fair comparison between engines.
+
 ## Setup
 
 Create a virtual environment and install dependencies:
@@ -41,36 +50,44 @@ pip install -r requirements.txt
 Run a benchmark:
 
 ```shell
-python microbenchmarks.py
+python microbenchmarks.py --suite strings
 ```
 
 ### Options
 
 | Option | Default | Description |
 |--------|---------|-------------|
+| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`) |
 | `--rows` | `1000000` | Number of rows in the generated test data |
 | `--warmup` | `2` | Number of warmup iterations before timing |
 | `--iterations` | `5` | Number of timed iterations (results are averaged) |
 | `--output` | stdout | Output file path for markdown results |
+| `--string-view` | `false` | Use Arrow StringView type instead of String (only affects data generated for the `strings` suite) |
 
 ### Examples
 
-Run the benchmark with default settings:
+Run the string functions benchmark (default):
 
 ```shell
-python microbenchmark.py
+python microbenchmarks.py
 ```
 
-Run the benchmark with 10 million rows:
+Run the temporal functions benchmark:
 
 ```shell
-python microbenchmarks.py --rows 10000000
+python microbenchmarks.py --suite temporal
 ```
 
-Run the benchmark and save results to a file:
+Run with 10 million rows:
 
 ```shell
-python microbenchmarks.py --output results.md
+python microbenchmarks.py --suite strings --rows 10000000
+```
+
+Run with StringView type and save results:
+
+```shell
+python microbenchmarks.py --suite strings --string-view --output results.md
 ```
 
 ## Output
@@ -83,4 +100,59 @@ The benchmark outputs a markdown table comparing execution times:
 | lower | 8.90 | 7.50 | 1.19x | DuckDB |
 | ... | ... | ... | ... | ... |
 
-A summary section shows overall statistics including how many functions each engine won and total execution times.
\ No newline at end of file
+A summary section shows overall statistics including how many functions each engine won and total execution times.
+
+## Project Structure
+
+```
+microbenchmarks/
+├── microbenchmarks.py      # Main benchmark runner
+├── requirements.txt        # Python dependencies
+└── suites/                 # Benchmark suite definitions
+    ├── __init__.py         # Suite registry and base classes
+    ├── strings.py          # String function benchmarks
+    └── temporal.py         # Date/time function benchmarks
+```
+
+## Adding New Suites
+
+To add a new benchmark suite:
+
+1. Create a new file in `suites/` (e.g., `suites/numeric.py`)
+
+2. Define your functions and data generator:
+
+```python
+from . import BenchmarkFunction, Suite
+import pyarrow as pa
+
+FUNCTIONS = [
+    BenchmarkFunction("abs", "abs({col})", "abs({col})"),
+    BenchmarkFunction("sqrt", "sqrt({col})", "sqrt({col})"),
+    # ... more functions
+]
+
+def generate_data(num_rows: int, use_string_view: bool = False) -> pa.Table:
+    # Generate appropriate test data
+    return pa.table({'num_col': pa.array(...)})
+
+SUITE = Suite(
+    name="numeric",
+    description="Numeric function benchmarks",
+    column_name="num_col",
+    functions=FUNCTIONS,
+    generate_data=generate_data,
+)
+```
+
+3. Register the suite in `suites/__init__.py`:
+
+```python
+from . import numeric
+
+SUITES: dict[str, Suite] = {
+    'strings': strings.SUITE,
+    'temporal': temporal.SUITE,
+    'numeric': numeric.SUITE,  # Add new suite here
+}
+```
\ No newline at end of file
diff --git a/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc b/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29c00ebb00c9cde6401dd62130527804bf8df5b2
GIT binary patch
literal 1603
zcma)6O>Y}F5GA>kwED7S#|i4BEfz>~P|-+5kAYF7b<?0g;EUrP_CiogN?v=ltCCb~
z3GSuNzeo!tw-!D0|M1#V{z4B$XI7Hs1T9bkoaM}r%bEAyZqaTx2#ky0kJCRmA%CKC
zb3lwdhG~BVb4(P8iK2>Sbi!graUEpAgvY#G^Q;nAHG4`_MO9xCRlQ<YG_IAAnhGHj
zmXT1^RU@lU8gT>8)m0PL&A3U4Zk)Ds^R%T}KeM>4w{%B`Beq9$SGD(v0$*W-tFGF9
z$>VMCc64WtjF{R{w;*y$@gvgPT|D?&=gG-LnzP8xQm1V+Doiv<6H^S=8QXsgO_3h;
zbtRolGHES1;epIDIn4B7g6W$h0Ov7G`zaVrVhVULK!wJE=DMQTJ}IBU>dB$z3e;2e
z+t>AcJIfQ77Wv}yZ}ce5p)whbR<Y<jbtjRX=T2Uf1rH1Gbw0*@ea~|_(Y}Mlj%F6#
zA@s%6_@<gAXKGkFhj3RY1a1_<H-wlJYL?-;CBzRinXNK4Aykn-IIzz6ju0}>3n#HK
zE5vWaY(a;m`Rj0SQcUzf=4w6|r*0spGC9$MO%)F}hz{OEc(4S2AViv{PKf?=ZctJo
zf{4m%5Y9px(iSb_|Bd)F*!=$x*Hv8u#Z*)USO|4pH^vM#Rq6<I37ph`lh9X=P#KH6
z8y>D3S&F7_DsU^{t8HS^DT3phNs-Mad5N&EuZgq1J=VE4(rJOf9QN7<do*=4*r;iu
zX`yL@@jOcbdBnn?s(g^<&hueWWS&i>uRKQuZdh2ZI=H<H2B$@e*7zZUcWIe>OCb(A
zuEG1$d|XP}>B+P(GV3vO53=S%Fur<p{N3{>N6RLwM!T8G6Fj^g3>JvM0ZjV^m`mcw
z3u<mF`kFfOn!O=!=#SI|id~XdL<N`h75Rl<(VrlDN%jc9%on>)wF8ZpN{EK@2o>00
zu+O4J=Quyhi}QS$dlW6&{phO~GA+NpM^TTPkKj_zPm477{7lcSZ+tJanSNqSVW5|i
zp1NEgxd9g<<?CbZM0xK6>;MS~*%sXey_x&qBrx9@0eA;6E%M}uquAmVg*NEo-ZLO9
zy1r}G4>n`_y};at6J@z3!qxTXvmwqmk@96d_91Mth;NITPvFtpEd>HvTmOl)HJJBR
zH=hEHOO@27a|ykln-3thR1ZqQlN+>FN?KRYsWfpL)s<Z@OT|n7waRdR`OEkcm0&*x
cQ-va1fEf$Al-0Vl)o4-9Ilc4uPW4Xs4;=7))c^nh

literal 0
HcmV?d00001

diff --git a/microbenchmarks/suites/__pycache__/strings.cpython-310.pyc b/microbenchmarks/suites/__pycache__/strings.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ee0c29a5dc2d5bd7555963f03cb90dd8f9c7e01
GIT binary patch
literal 2506
zcmb7FOLNpl5SFxh?As5FF@}IJkF^}*^#eml2q8QS973vML!~xFsbuwdS4Jx-Gg^$5
z^@&{h1*ufo$5j5&TvMsa-f~V3Nzd$qO;rwwRGR*#zn<1i_e@V*sT37_-u(PLxKmM-
zUqom8xqzK}__0V;6i^xpsGx;vLzPl%Xj0}HI^;kPjkeh^+g8I;6=H$`CYWz?h%CrM
zUZMgNp(s%a%21Z50#&F=Gy#(^DbW;656~%?ff=bg4YM#S(HWS7If>4~Jj_dU4$i}Q
zi9Ub}a6zJrV1q5uC0Kw3i7vwxxFXS2xCYlGs=*>GO60&2EJ^esEW@%y*I@-#B&x$I
ztVy&E8?YhK4cLTDiEhFzxFyj?a2sw*^f7z_ci!fj=Nfr<0-uor+#^N!oap<dJrnN3
zgYS(-nON`;9>EsLlQKMhXEZ9HZ7a^x?9z5hgQ#imbs|3v;)vP1B=TErkG`^5CrHUk
zPbBME50j%f8lSp9q_+kmxUntlxQ>q6SxK~u-}!NP<ZvUDy0R1_CG9Jv7%9W}fY7Wg
z#Yovn5}}l0s5C<oHPcpB>C2(I<hMNP4!G&D&d|W3<Tas;WYqx}5_w7z;-&6JHrYpg
zV%gqsxar4{@1@yHUk;hU@(O*ed%&{uBe+<*UR$=M#XvZ`!aBPQR~<FoHFq<c9d(Be
z*Ou)~{98!&Qsk>b4k^@IG-$R2F(JV?6DA(G>#Ny}07GJZb=j^h)x^LdEx^CfBndsA
zWOKN10frVe50_B;8a)(J>P0N{Qj#r<;CnL?#2sO}w!U#=6U_~e`9YACrRbYEJlBTJ
ztccA}VX~IDzIijN^yN@(?8=QXCB^6;cFoBq1u?GSMV5p?>Lwmdv$OB<n9Z`i=I|0r
z(So@L0q&UjvChaaB`g)|BoPeE=0<R21YYW)k%>7>VwO$!<%lN8;jU_uH;FrlWOntR
z{gLfAo`~5?5j4zDw)9`j(6(7~diU;6C|f)qb=oeC4;Y{5Fyaoj)$4#9aFdd>LnB`k
zCwB1UQCj#tz>ocoO{yGYWj#?<rKucgsn%8Ast=WyFOG8E9B8x3k>15h`%0r1Ku3)+
ztNfsV`D5<Li0&d|A+rMc<6Nrj>#4DC0@mAi+KGz%3i2i7Z$r78!%-DE>N0ApsNI4I
z^fxK}O-W`J=VE~d(<j=IIp#BuehX(5ICVzpnnUkh?YP)A#Js;PbyeC*OULCCCCZ^R
zPLv}HWd&x^(tfq8buBpkP783ASmS21)yE_v6!)+#vTciO+G0o8uRV&%V0J%<cH*Te
zp%Gi@En%Q~1~ycbz09)ZNH_Q`hds@<sR%JwdNU{-m8-9MQc>7J{}0i<%T4!%7j{Sw
z2Pj#&MZz$)4`LdE-2<UA<TQHgi2qP+Zs8PMESl&{PY)B~Okg&>2;w%^86m(8>@~5-
z?#DsIEx#29K4Ef4VpZW2^1!*WdiowCc=B;@5T(2_V2D^64uGH;q$oWstr2i7@x&pK
zrAh7~BlUJef(MyW@6gH9OE^KgfDJGA&6q3V$JKzB2k~;ngt^M7xaWCZtVC#jQs|ag
z-52#%+$MFL&tV<^UH1~tZ;`t2ve#jF<y;@#IIP|dd>W7S<rT#0gI}*ChrH4pBvZ~q
zB#K?brl{#!S<T}+jo%!8Rf)@*t(MgpO~=0Z7kwh8D1wbqW7+(+1f$FE{oTjtDq1=Q
z*CVe@cnJveX^@B;l9w<~op$6(rZBoVrEf>0HV)i!s@!s2h<(?kBFFR=Hrz@MF|P3e
z{Zy#Lw5ix1R2O7n>-n<>J5RrSw#{=QbNX%^huqi^`RLEv*na+W=aG!+So2s6&fjUr
m&<V+1v0vF&*yu`L!*FPmT2U{m|Nqs0oxiBfYwGYV{`oiHVe{bt

literal 0
HcmV?d00001

diff --git a/microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc b/microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9491c06d187e10488b02a3d866119afc3ac39d1b
GIT binary patch
literal 2607
zcmZuyOK;mo5av^)-f~_}oJW~9j?5&s@@Sf%NmDlu2f4UFZ67M2AZX3nVj_~tauwIA
zbaJo$4@qwY`iu5dpuOlXw1;+fNm&#U5n6sT{APAIGs}sq)slk0oj-O0r>-b}2_gSG
zkHR|sldn}p0i~^g3R+KXt9aLYTBx<PP;cvMsyEsu7-04aq1CoRyKSoqv7i7J*he~|
zB9x#kQ3a|{lc)|2Sdi!xoQ5+JorQC-DA9Si080{GgiCN)qATzjI1*ij&*2M+uE8=a
zOLQHY(3EHeTyQ120XN~MM7LlSRwY`47PKU~4R_#<M0epHd@0d=jH)5gR~Xf4i5|d1
zczC4mpKTZ6IXogIcudOh4KZI=_6qO>zWvo~SBVYl@EtrQMN)-l$7Z_*+Lq$J7`C=a
z*pI2-b5_7-q{V`exX#`n>aZY=5@(l0oo?vUH%>AL7+IT$GHkjUw*_6Wc~PuDdd#0-
zH57cd@N{amkvZb#mN4hWA@S+(ys-89K4mLSp=jQ6e(A)$u{$gaaU~VF8O9Oo4lm@`
zQZu7hq&(&8;2#bbb8M7n+!9JtrqPWD^I-_Zj9V7slw1gdXu!yDDd#y=%?zrk;>08&
zoj8IylT?*6NflEQ<A9JibD0r}8NDpTDcO2Ez~m7+yqJqA)6Gn3nPzHL#teAzo+o#j
z%Ts(XvuYqK-plVcwUf1*x69=>waeG9CWB)Xd%#1d*~>|JrHP7@JvrK9L!`{K=xMSu
z?tImGJ;#<Yr5%!yg=XeC&y_`=<IYu`SEXMcKt!7iFUU4@ZaP7PseXq?r1`)RPnsDs
z8SKt6S3ki__ZL3pOC-akkG<Y;(ih|wi@i=456Dsm(#qX(n#(_~E{Chj&~%3v6Gr#M
zk^96$o~B%!EKj^!mRKpL9m=gB!IPJ;VKoipu@l7yEAC|dX<gaib~Fe*8XqLQK1hg%
zTLsa+_bwm@yg&&X(5Rz{FF^LI;N8amGyEqFJpD|0kKf=As;V#zG|)jm(#IMYV|`y4
z8%!T5N9uFsCo(oiCYa}xabZ+|!W)gQqoo0bk^Vs&TPX$$v7KQXaWRKWBaN9OYh;g#
zqY@O4jYVZ#j_#p19liaK_m(oDbgW9wRpgYB)5~)Z7xopX{D!fQ?Wu359&6IZk{T0g
zsgIHI-Wrv~TfRNb?Ds{eGy8REWFRg><AVwd$6A1%@C$otc<&{N2*v((7)cm55c8db
zfOVaBJ`LhQ;)q%%%n$oc-)GoqQL;9<fe8)%wiJaL0$oKk5r~Q}>|{|ak8p7fZitTL
zrl@wpt+dDp(vM5xs`Q#5i=w&3Nl{=iS|??(i1JOUVyTG*C#^DRh-Cp6SCgi6Szwhm
z@x}`*@$f^<wE=qO20QE%w}PGFM-YeHNC*Kcw&EtL_6fJd6Ffdtm)4zb9CQdT`29YK
zfNOo9o6O(s5pMdF`iJQ#k-2ay^fKDeE5fk2>ti#D)5-17|5+Y7UOLHJ!qtFRvgCOp
zs9EIny6_@{*I4qqLSLZ#&t|I|hoptU9k%uZ*7Ey)r%PI*HhY5v2jA+++)G+v(4lco
zmV+>9v21Xz^$&S<KdZ3_(r@7w$!%OpT17WhL#wE^W~&vvi|d>!a6_&&^^{uEE66#c
z7XG6z(TLkq&%?{l@s;IBKAFg~ft=90$aYO`L_SX8GLWQ0gT5Hsyo^N}gpnto6i=qP
z^pADDgoYb#om-v<amVxMeN@Q8rG>KkhggsJfQlaD3N^*!g22U%oy}+4ufE^h;yTuq
z8@n;KgSjn=m_~1I?Y!E4A=8{&HiWVLIE-P?BTqyVCqiW?wn&85&`M@W{piv&O?uZ$
S;;tKiE?v{EYwGkW{rf)~hYptj

literal 0
HcmV?d00001

diff --git a/microbenchmarks/suites/temporal.py b/microbenchmarks/suites/temporal.py
index 6f64286..3e1c02e 100644
--- a/microbenchmarks/suites/temporal.py
+++ b/microbenchmarks/suites/temporal.py
@@ -10,16 +10,16 @@
 
 FUNCTIONS = [
     # Date extraction functions
-    BenchmarkFunction("year", "year({col})", "year({col})"),
-    BenchmarkFunction("month", "month({col})", "month({col})"),
-    BenchmarkFunction("day", "day({col})", "day({col})"),
-    BenchmarkFunction("hour", "hour({col})", "hour({col})"),
-    BenchmarkFunction("minute", "minute({col})", "minute({col})"),
-    BenchmarkFunction("second", "second({col})", "second({col})"),
-    BenchmarkFunction("week", "week({col})", "week({col})"),
-    BenchmarkFunction("quarter", "quarter({col})", "quarter({col})"),
-    BenchmarkFunction("day_of_week", "extract(dow from {col})", "dayofweek({col})"),
-    BenchmarkFunction("day_of_year", "extract(doy from {col})", "dayofyear({col})"),
+    BenchmarkFunction("year", "date_part('year', {col})", "year({col})"),
+    BenchmarkFunction("month", "date_part('month', {col})", "month({col})"),
+    BenchmarkFunction("day", "date_part('day', {col})", "day({col})"),
+    BenchmarkFunction("hour", "date_part('hour', {col})", "hour({col})"),
+    BenchmarkFunction("minute", "date_part('minute', {col})", "minute({col})"),
+    BenchmarkFunction("second", "date_part('second', {col})", "second({col})"),
+    BenchmarkFunction("week", "date_part('week', {col})", "week({col})"),
+    BenchmarkFunction("quarter", "date_part('quarter', {col})", "quarter({col})"),
+    BenchmarkFunction("day_of_week", "date_part('dow', {col})", "dayofweek({col})"),
+    BenchmarkFunction("day_of_year", "date_part('doy', {col})", "dayofyear({col})"),
 
     # Date truncation
     BenchmarkFunction("date_trunc_day", "date_trunc('day', {col})", "date_trunc('day', {col})"),

From f49e4fc6358e6f914b081a816c36848f9139a21f Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 31 Dec 2025 16:07:09 -0700
Subject: [PATCH 3/6] delete cache

---
 .../suites/__pycache__/__init__.cpython-310.pyc  | Bin 1603 -> 0 bytes
 .../suites/__pycache__/strings.cpython-310.pyc   | Bin 2506 -> 0 bytes
 .../suites/__pycache__/temporal.cpython-310.pyc  | Bin 2607 -> 0 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc
 delete mode 100644 microbenchmarks/suites/__pycache__/strings.cpython-310.pyc
 delete mode 100644 microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc

diff --git a/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc b/microbenchmarks/suites/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index 29c00ebb00c9cde6401dd62130527804bf8df5b2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1603
zcma)6O>Y}F5GA>kwED7S#|i4BEfz>~P|-+5kAYF7b<?0g;EUrP_CiogN?v=ltCCb~
z3GSuNzeo!tw-!D0|M1#V{z4B$XI7Hs1T9bkoaM}r%bEAyZqaTx2#ky0kJCRmA%CKC
zb3lwdhG~BVb4(P8iK2>Sbi!graUEpAgvY#G^Q;nAHG4`_MO9xCRlQ<YG_IAAnhGHj
zmXT1^RU@lU8gT>8)m0PL&A3U4Zk)Ds^R%T}KeM>4w{%B`Beq9$SGD(v0$*W-tFGF9
z$>VMCc64WtjF{R{w;*y$@gvgPT|D?&=gG-LnzP8xQm1V+Doiv<6H^S=8QXsgO_3h;
zbtRolGHES1;epIDIn4B7g6W$h0Ov7G`zaVrVhVULK!wJE=DMQTJ}IBU>dB$z3e;2e
z+t>AcJIfQ77Wv}yZ}ce5p)whbR<Y<jbtjRX=T2Uf1rH1Gbw0*@ea~|_(Y}Mlj%F6#
zA@s%6_@<gAXKGkFhj3RY1a1_<H-wlJYL?-;CBzRinXNK4Aykn-IIzz6ju0}>3n#HK
zE5vWaY(a;m`Rj0SQcUzf=4w6|r*0spGC9$MO%)F}hz{OEc(4S2AViv{PKf?=ZctJo
zf{4m%5Y9px(iSb_|Bd)F*!=$x*Hv8u#Z*)USO|4pH^vM#Rq6<I37ph`lh9X=P#KH6
z8y>D3S&F7_DsU^{t8HS^DT3phNs-Mad5N&EuZgq1J=VE4(rJOf9QN7<do*=4*r;iu
zX`yL@@jOcbdBnn?s(g^<&hueWWS&i>uRKQuZdh2ZI=H<H2B$@e*7zZUcWIe>OCb(A
zuEG1$d|XP}>B+P(GV3vO53=S%Fur<p{N3{>N6RLwM!T8G6Fj^g3>JvM0ZjV^m`mcw
z3u<mF`kFfOn!O=!=#SI|id~XdL<N`h75Rl<(VrlDN%jc9%on>)wF8ZpN{EK@2o>00
zu+O4J=Quyhi}QS$dlW6&{phO~GA+NpM^TTPkKj_zPm477{7lcSZ+tJanSNqSVW5|i
zp1NEgxd9g<<?CbZM0xK6>;MS~*%sXey_x&qBrx9@0eA;6E%M}uquAmVg*NEo-ZLO9
zy1r}G4>n`_y};at6J@z3!qxTXvmwqmk@96d_91Mth;NITPvFtpEd>HvTmOl)HJJBR
zH=hEHOO@27a|ykln-3thR1ZqQlN+>FN?KRYsWfpL)s<Z@OT|n7waRdR`OEkcm0&*x
cQ-va1fEf$Al-0Vl)o4-9Ilc4uPW4Xs4;=7))c^nh

diff --git a/microbenchmarks/suites/__pycache__/strings.cpython-310.pyc b/microbenchmarks/suites/__pycache__/strings.cpython-310.pyc
deleted file mode 100644
index 1ee0c29a5dc2d5bd7555963f03cb90dd8f9c7e01..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2506
zcmb7FOLNpl5SFxh?As5FF@}IJkF^}*^#eml2q8QS973vML!~xFsbuwdS4Jx-Gg^$5
z^@&{h1*ufo$5j5&TvMsa-f~V3Nzd$qO;rwwRGR*#zn<1i_e@V*sT37_-u(PLxKmM-
zUqom8xqzK}__0V;6i^xpsGx;vLzPl%Xj0}HI^;kPjkeh^+g8I;6=H$`CYWz?h%CrM
zUZMgNp(s%a%21Z50#&F=Gy#(^DbW;656~%?ff=bg4YM#S(HWS7If>4~Jj_dU4$i}Q
zi9Ub}a6zJrV1q5uC0Kw3i7vwxxFXS2xCYlGs=*>GO60&2EJ^esEW@%y*I@-#B&x$I
ztVy&E8?YhK4cLTDiEhFzxFyj?a2sw*^f7z_ci!fj=Nfr<0-uor+#^N!oap<dJrnN3
zgYS(-nON`;9>EsLlQKMhXEZ9HZ7a^x?9z5hgQ#imbs|3v;)vP1B=TErkG`^5CrHUk
zPbBME50j%f8lSp9q_+kmxUntlxQ>q6SxK~u-}!NP<ZvUDy0R1_CG9Jv7%9W}fY7Wg
z#Yovn5}}l0s5C<oHPcpB>C2(I<hMNP4!G&D&d|W3<Tas;WYqx}5_w7z;-&6JHrYpg
zV%gqsxar4{@1@yHUk;hU@(O*ed%&{uBe+<*UR$=M#XvZ`!aBPQR~<FoHFq<c9d(Be
z*Ou)~{98!&Qsk>b4k^@IG-$R2F(JV?6DA(G>#Ny}07GJZb=j^h)x^LdEx^CfBndsA
zWOKN10frVe50_B;8a)(J>P0N{Qj#r<;CnL?#2sO}w!U#=6U_~e`9YACrRbYEJlBTJ
ztccA}VX~IDzIijN^yN@(?8=QXCB^6;cFoBq1u?GSMV5p?>Lwmdv$OB<n9Z`i=I|0r
z(So@L0q&UjvChaaB`g)|BoPeE=0<R21YYW)k%>7>VwO$!<%lN8;jU_uH;FrlWOntR
z{gLfAo`~5?5j4zDw)9`j(6(7~diU;6C|f)qb=oeC4;Y{5Fyaoj)$4#9aFdd>LnB`k
zCwB1UQCj#tz>ocoO{yGYWj#?<rKucgsn%8Ast=WyFOG8E9B8x3k>15h`%0r1Ku3)+
ztNfsV`D5<Li0&d|A+rMc<6Nrj>#4DC0@mAi+KGz%3i2i7Z$r78!%-DE>N0ApsNI4I
z^fxK}O-W`J=VE~d(<j=IIp#BuehX(5ICVzpnnUkh?YP)A#Js;PbyeC*OULCCCCZ^R
zPLv}HWd&x^(tfq8buBpkP783ASmS21)yE_v6!)+#vTciO+G0o8uRV&%V0J%<cH*Te
zp%Gi@En%Q~1~ycbz09)ZNH_Q`hds@<sR%JwdNU{-m8-9MQc>7J{}0i<%T4!%7j{Sw
z2Pj#&MZz$)4`LdE-2<UA<TQHgi2qP+Zs8PMESl&{PY)B~Okg&>2;w%^86m(8>@~5-
z?#DsIEx#29K4Ef4VpZW2^1!*WdiowCc=B;@5T(2_V2D^64uGH;q$oWstr2i7@x&pK
zrAh7~BlUJef(MyW@6gH9OE^KgfDJGA&6q3V$JKzB2k~;ngt^M7xaWCZtVC#jQs|ag
z-52#%+$MFL&tV<^UH1~tZ;`t2ve#jF<y;@#IIP|dd>W7S<rT#0gI}*ChrH4pBvZ~q
zB#K?brl{#!S<T}+jo%!8Rf)@*t(MgpO~=0Z7kwh8D1wbqW7+(+1f$FE{oTjtDq1=Q
z*CVe@cnJveX^@B;l9w<~op$6(rZBoVrEf>0HV)i!s@!s2h<(?kBFFR=Hrz@MF|P3e
z{Zy#Lw5ix1R2O7n>-n<>J5RrSw#{=QbNX%^huqi^`RLEv*na+W=aG!+So2s6&fjUr
m&<V+1v0vF&*yu`L!*FPmT2U{m|Nqs0oxiBfYwGYV{`oiHVe{bt

diff --git a/microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc b/microbenchmarks/suites/__pycache__/temporal.cpython-310.pyc
deleted file mode 100644
index 9491c06d187e10488b02a3d866119afc3ac39d1b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2607
zcmZuyOK;mo5av^)-f~_}oJW~9j?5&s@@Sf%NmDlu2f4UFZ67M2AZX3nVj_~tauwIA
zbaJo$4@qwY`iu5dpuOlXw1;+fNm&#U5n6sT{APAIGs}sq)slk0oj-O0r>-b}2_gSG
zkHR|sldn}p0i~^g3R+KXt9aLYTBx<PP;cvMsyEsu7-04aq1CoRyKSoqv7i7J*he~|
zB9x#kQ3a|{lc)|2Sdi!xoQ5+JorQC-DA9Si080{GgiCN)qATzjI1*ij&*2M+uE8=a
zOLQHY(3EHeTyQ120XN~MM7LlSRwY`47PKU~4R_#<M0epHd@0d=jH)5gR~Xf4i5|d1
zczC4mpKTZ6IXogIcudOh4KZI=_6qO>zWvo~SBVYl@EtrQMN)-l$7Z_*+Lq$J7`C=a
z*pI2-b5_7-q{V`exX#`n>aZY=5@(l0oo?vUH%>AL7+IT$GHkjUw*_6Wc~PuDdd#0-
zH57cd@N{amkvZb#mN4hWA@S+(ys-89K4mLSp=jQ6e(A)$u{$gaaU~VF8O9Oo4lm@`
zQZu7hq&(&8;2#bbb8M7n+!9JtrqPWD^I-_Zj9V7slw1gdXu!yDDd#y=%?zrk;>08&
zoj8IylT?*6NflEQ<A9JibD0r}8NDpTDcO2Ez~m7+yqJqA)6Gn3nPzHL#teAzo+o#j
z%Ts(XvuYqK-plVcwUf1*x69=>waeG9CWB)Xd%#1d*~>|JrHP7@JvrK9L!`{K=xMSu
z?tImGJ;#<Yr5%!yg=XeC&y_`=<IYu`SEXMcKt!7iFUU4@ZaP7PseXq?r1`)RPnsDs
z8SKt6S3ki__ZL3pOC-akkG<Y;(ih|wi@i=456Dsm(#qX(n#(_~E{Chj&~%3v6Gr#M
zk^96$o~B%!EKj^!mRKpL9m=gB!IPJ;VKoipu@l7yEAC|dX<gaib~Fe*8XqLQK1hg%
zTLsa+_bwm@yg&&X(5Rz{FF^LI;N8amGyEqFJpD|0kKf=As;V#zG|)jm(#IMYV|`y4
z8%!T5N9uFsCo(oiCYa}xabZ+|!W)gQqoo0bk^Vs&TPX$$v7KQXaWRKWBaN9OYh;g#
zqY@O4jYVZ#j_#p19liaK_m(oDbgW9wRpgYB)5~)Z7xopX{D!fQ?Wu359&6IZk{T0g
zsgIHI-Wrv~TfRNb?Ds{eGy8REWFRg><AVwd$6A1%@C$otc<&{N2*v((7)cm55c8db
zfOVaBJ`LhQ;)q%%%n$oc-)GoqQL;9<fe8)%wiJaL0$oKk5r~Q}>|{|ak8p7fZitTL
zrl@wpt+dDp(vM5xs`Q#5i=w&3Nl{=iS|??(i1JOUVyTG*C#^DRh-Cp6SCgi6Szwhm
z@x}`*@$f^<wE=qO20QE%w}PGFM-YeHNC*Kcw&EtL_6fJd6Ffdtm)4zb9CQdT`29YK
zfNOo9o6O(s5pMdF`iJQ#k-2ay^fKDeE5fk2>ti#D)5-17|5+Y7UOLHJ!qtFRvgCOp
zs9EIny6_@{*I4qqLSLZ#&t|I|hoptU9k%uZ*7Ey)r%PI*HhY5v2jA+++)G+v(4lco
zmV+>9v21Xz^$&S<KdZ3_(r@7w$!%OpT17WhL#wE^W~&vvi|d>!a6_&&^^{uEE66#c
z7XG6z(TLkq&%?{l@s;IBKAFg~ft=90$aYO`L_SX8GLWQ0gT5Hsyo^N}gpnto6i=qP
z^pADDgoYb#om-v<amVxMeN@Q8rG>KkhggsJfQlaD3N^*!g22U%oy}+4ufE^h;yTuq
z8@n;KgSjn=m_~1I?Y!E4A=8{&HiWVLIE-P?BTqyVCqiW?wn&85&`M@W{piv&O?uZ$
S;;tKiE?v{EYwGkW{rf)~hYptj


From 319a951337f80059ba6cff08f59bb98773eac1cc Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 31 Dec 2025 16:07:13 -0700
Subject: [PATCH 4/6] ignore __pycache__ directories

---
 microbenchmarks/.gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 microbenchmarks/.gitignore

diff --git a/microbenchmarks/.gitignore b/microbenchmarks/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/microbenchmarks/.gitignore
@@ -0,0 +1 @@
+__pycache__

From 97c22304cc743566ccb7d549de6f34a6d0e3e0e5 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 31 Dec 2025 16:11:59 -0700
Subject: [PATCH 5/6] numeric suite

---
 microbenchmarks/README.md          |   3 +-
 microbenchmarks/suites/__init__.py |   2 +
 microbenchmarks/suites/numeric.py  | 104 +++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 microbenchmarks/suites/numeric.py

diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md
index b086a22..1001816 100644
--- a/microbenchmarks/README.md
+++ b/microbenchmarks/README.md
@@ -31,6 +31,7 @@ Benchmarks are organized into **suites**, each focusing on a specific category o
 |-------|-------------|-----------|
 | `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 |
 | `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 |
+| `numeric` | Math functions (sqrt, pow, sin, cos, log, round, etc.) | 38 |
 
 All benchmarks run in single-threaded mode for fair comparison between engines.
 
@@ -57,7 +58,7 @@ python microbenchmarks.py --suite strings
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`) |
+| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`) |
 | `--rows` | `1000000` | Number of rows in the generated test data |
 | `--warmup` | `2` | Number of warmup iterations before timing |
 | `--iterations` | `5` | Number of timed iterations (results are averaged) |
diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py
index dbfc859..9527f1a 100644
--- a/microbenchmarks/suites/__init__.py
+++ b/microbenchmarks/suites/__init__.py
@@ -26,11 +26,13 @@ class Suite:
 # Import suites to register them
 from . import strings
 from . import temporal
+from . import numeric
 
 # Registry of available suites
 SUITES: dict[str, Suite] = {
     'strings': strings.SUITE,
     'temporal': temporal.SUITE,
+    'numeric': numeric.SUITE,
 }
 
 
diff --git a/microbenchmarks/suites/numeric.py b/microbenchmarks/suites/numeric.py
new file mode 100644
index 0000000..9ad4ac4
--- /dev/null
+++ b/microbenchmarks/suites/numeric.py
@@ -0,0 +1,104 @@
+"""Numeric/math functions benchmark suite."""
+
+import random
+
+import pyarrow as pa
+
+from . import BenchmarkFunction, Suite
+
+
+FUNCTIONS = [
+    # Basic math
+    BenchmarkFunction("abs", "abs({col})", "abs({col})"),
+    BenchmarkFunction("ceil", "ceil({col})", "ceil({col})"),
+    BenchmarkFunction("floor", "floor({col})", "floor({col})"),
+    BenchmarkFunction("round", "round({col}, 2)", "round({col}, 2)"),
+    BenchmarkFunction("trunc", "trunc({col})", "trunc({col})"),
+    BenchmarkFunction("signum", "signum({col})", "sign({col})"),
+
+    # Powers and roots
+    BenchmarkFunction("sqrt", "sqrt(abs({col}))", "sqrt(abs({col}))"),
+    BenchmarkFunction("cbrt", "cbrt({col})", "cbrt({col})"),
+    BenchmarkFunction("power", "power({col}, 2)", "power({col}, 2)"),
+    BenchmarkFunction("exp", "exp({col} / 100)", "exp({col} / 100)"),
+
+    # Logarithms
+    BenchmarkFunction("ln", "ln(abs({col}) + 1)", "ln(abs({col}) + 1)"),
+    BenchmarkFunction("log10", "log10(abs({col}) + 1)", "log10(abs({col}) + 1)"),
+    BenchmarkFunction("log2", "log2(abs({col}) + 1)", "log2(abs({col}) + 1)"),
+    BenchmarkFunction("log", "log(2, abs({col}) + 1)", "log(2, abs({col}) + 1)"),
+
+    # Trigonometric
+    BenchmarkFunction("sin", "sin({col})", "sin({col})"),
+    BenchmarkFunction("cos", "cos({col})", "cos({col})"),
+    BenchmarkFunction("tan", "tan({col})", "tan({col})"),
+    BenchmarkFunction("asin", "asin(sin({col}))", "asin(sin({col}))"),
+    BenchmarkFunction("acos", "acos(cos({col}))", "acos(cos({col}))"),
+    BenchmarkFunction("atan", "atan({col})", "atan({col})"),
+    BenchmarkFunction("atan2", "atan2({col}, {col} + 1)", "atan2({col}, {col} + 1)"),
+
+    # Hyperbolic
+    BenchmarkFunction("sinh", "sinh({col} / 100)", "sinh({col} / 100)"),
+    BenchmarkFunction("cosh", "cosh({col} / 100)", "cosh({col} / 100)"),
+    BenchmarkFunction("tanh", "tanh({col})", "tanh({col})"),
+
+    # Other math functions
+    BenchmarkFunction("degrees", "degrees({col})", "degrees({col})"),
+    BenchmarkFunction("radians", "radians({col})", "radians({col})"),
+    BenchmarkFunction("pi", "pi() * {col}", "pi() * {col}"),
+    BenchmarkFunction("mod", "CAST({col} AS BIGINT) % 7", "CAST({col} AS BIGINT) % 7"),
+    BenchmarkFunction("gcd", "gcd(CAST({col} AS BIGINT), 12)", "gcd(CAST({col} AS BIGINT), 12)"),
+    BenchmarkFunction("lcm", "lcm(CAST(abs({col}) AS BIGINT) % 1000 + 1, 12)", "lcm(CAST(abs({col}) AS BIGINT) % 1000 + 1, 12)"),
+    BenchmarkFunction("factorial", "factorial(CAST(abs({col}) AS BIGINT) % 20)", "factorial(CAST(abs({col}) AS INTEGER) % 20)"),
+
+    # Comparison
+    BenchmarkFunction("greatest", "greatest({col}, {col} * 2, 0)", "greatest({col}, {col} * 2, 0)"),
+    BenchmarkFunction("least", "least({col}, {col} * 2, 0)", "least({col}, {col} * 2, 0)"),
+
+    # Null handling on numeric values (num_col is generated without NULLs, so coalesce never falls back)
+    BenchmarkFunction("coalesce", "coalesce({col}, 0)", "coalesce({col}, 0)"),
+    BenchmarkFunction("nullif", "nullif({col}, 0)", "nullif({col}, 0)"),
+
+    # Bitwise (on integers); DuckDB's ^ is exponentiation, so bitwise XOR uses xor()
+    BenchmarkFunction("bit_and", "CAST({col} AS BIGINT) & 255", "CAST({col} AS BIGINT) & 255"),
+    BenchmarkFunction("bit_or", "CAST({col} AS BIGINT) | 255", "CAST({col} AS BIGINT) | 255"),
+    BenchmarkFunction("bit_xor", "CAST({col} AS BIGINT) ^ 255", "xor(CAST({col} AS BIGINT), 255)"),
+]
+
+
+def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table:
+    """Generate test data with various numeric patterns."""
+    random.seed(42)  # For reproducibility
+
+    values = []
+    for i in range(num_rows):
+        pattern_type = i % 5
+        if pattern_type == 0:
+            # Small integers
+            v = random.randint(-100, 100)
+        elif pattern_type == 1:
+            # Larger integers
+            v = random.randint(-10000, 10000)
+        elif pattern_type == 2:
+            # Floating point values
+            v = random.uniform(-1000, 1000)
+        elif pattern_type == 3:
+            # Small fractional values between -1 and 1
+            v = random.uniform(-1, 1)
+        else:
+            # Normally distributed values (mean 0, standard deviation 500)
+            v = random.gauss(0, 500)
+        values.append(v)
+
+    return pa.table({
+        'num_col': pa.array(values, type=pa.float64())
+    })
+
+
+SUITE = Suite(
+    name="numeric",
+    description="Numeric function benchmarks",
+    column_name="num_col",
+    functions=FUNCTIONS,
+    generate_data=generate_data,
+)

From a254519618dbd43ff59a5fc67e4c56381a77d2d8 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 31 Dec 2025 16:18:11 -0700
Subject: [PATCH 6/6] add conditional suite

---
 microbenchmarks/README.md             |   3 +-
 microbenchmarks/suites/__init__.py    |   2 +
 microbenchmarks/suites/conditional.py | 110 ++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 microbenchmarks/suites/conditional.py

diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md
index 1001816..1fbfdc6 100644
--- a/microbenchmarks/README.md
+++ b/microbenchmarks/README.md
@@ -32,6 +32,7 @@ Benchmarks are organized into **suites**, each focusing on a specific category o
 | `strings` | String manipulation functions (trim, lower, upper, concat, etc.) | 27 |
 | `temporal` | Date/time functions (year, month, date_trunc, etc.) | 21 |
 | `numeric` | Math functions (sqrt, pow, sin, cos, log, round, etc.) | 38 |
+| `conditional` | Conditional logic (CASE, COALESCE, boolean ops, comparisons) | 36 |
 
 All benchmarks run in single-threaded mode for fair comparison between engines.
 
@@ -58,7 +59,7 @@ python microbenchmarks.py --suite strings
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`) |
+| `--suite` | `strings` | Benchmark suite to run (`strings`, `temporal`, `numeric`, `conditional`) |
 | `--rows` | `1000000` | Number of rows in the generated test data |
 | `--warmup` | `2` | Number of warmup iterations before timing |
 | `--iterations` | `5` | Number of timed iterations (results are averaged) |
diff --git a/microbenchmarks/suites/__init__.py b/microbenchmarks/suites/__init__.py
index 9527f1a..329ffba 100644
--- a/microbenchmarks/suites/__init__.py
+++ b/microbenchmarks/suites/__init__.py
@@ -27,12 +27,14 @@ class Suite:
 from . import strings
 from . import temporal
 from . import numeric
+from . import conditional
 
 # Registry of available suites
 SUITES: dict[str, Suite] = {
     'strings': strings.SUITE,
     'temporal': temporal.SUITE,
     'numeric': numeric.SUITE,
+    'conditional': conditional.SUITE,
 }
 
 
diff --git a/microbenchmarks/suites/conditional.py b/microbenchmarks/suites/conditional.py
new file mode 100644
index 0000000..0ebbf2a
--- /dev/null
+++ b/microbenchmarks/suites/conditional.py
@@ -0,0 +1,110 @@
+"""Conditional/logic functions benchmark suite."""
+
+import random
+
+import pyarrow as pa
+
+from . import BenchmarkFunction, Suite
+
+
+FUNCTIONS = [
+    # CASE expressions
+    BenchmarkFunction("case_simple",
+        "CASE {col} WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'other' END",
+        "CASE {col} WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'other' END"),
+    BenchmarkFunction("case_searched",
+        "CASE WHEN {col} < 0 THEN 'negative' WHEN {col} = 0 THEN 'zero' ELSE 'positive' END",
+        "CASE WHEN {col} < 0 THEN 'negative' WHEN {col} = 0 THEN 'zero' ELSE 'positive' END"),
+    BenchmarkFunction("case_many_branches",
+        "CASE WHEN {col} < -50 THEN 'a' WHEN {col} < -25 THEN 'b' WHEN {col} < 0 THEN 'c' WHEN {col} < 25 THEN 'd' WHEN {col} < 50 THEN 'e' ELSE 'f' END",
+        "CASE WHEN {col} < -50 THEN 'a' WHEN {col} < -25 THEN 'b' WHEN {col} < 0 THEN 'c' WHEN {col} < 25 THEN 'd' WHEN {col} < 50 THEN 'e' ELSE 'f' END"),
+    BenchmarkFunction("case_nested",
+        "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END",
+        "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END"),
+
+    # NULL handling
+    BenchmarkFunction("coalesce_2", "COALESCE(nullable_col, 0)", "COALESCE(nullable_col, 0)"),
+    BenchmarkFunction("coalesce_3", "COALESCE(nullable_col, {col}, 0)", "COALESCE(nullable_col, {col}, 0)"),
+    BenchmarkFunction("coalesce_many", "COALESCE(nullable_col, NULL, NULL, {col}, 0)", "COALESCE(nullable_col, NULL, NULL, {col}, 0)"),
+    BenchmarkFunction("nullif", "NULLIF({col}, 0)", "NULLIF({col}, 0)"),
+    BenchmarkFunction("nullif_expr", "NULLIF({col} % 10, 5)", "NULLIF({col} % 10, 5)"),
+    BenchmarkFunction("ifnull", "IFNULL(nullable_col, -1)", "IFNULL(nullable_col, -1)"),
+    BenchmarkFunction("nvl", "NVL(nullable_col, -1)", "IFNULL(nullable_col, -1)"),
+
+    # Comparison functions
+    BenchmarkFunction("greatest_2", "GREATEST({col}, {col} * -1)", "GREATEST({col}, {col} * -1)"),
+    BenchmarkFunction("greatest_3", "GREATEST({col}, 0, -100)", "GREATEST({col}, 0, -100)"),
+    BenchmarkFunction("least_2", "LEAST({col}, {col} * -1)", "LEAST({col}, {col} * -1)"),
+    BenchmarkFunction("least_3", "LEAST({col}, 0, 100)", "LEAST({col}, 0, 100)"),
+
+    # Boolean logic
+    BenchmarkFunction("and_simple", "{col} > 0 AND {col} < 50", "{col} > 0 AND {col} < 50"),
+    BenchmarkFunction("or_simple", "{col} < -50 OR {col} > 50", "{col} < -50 OR {col} > 50"),
+    BenchmarkFunction("not", "NOT ({col} > 0)", "NOT ({col} > 0)"),
+    BenchmarkFunction("and_or_mixed", "({col} > 0 AND {col} < 50) OR {col} < -50", "({col} > 0 AND {col} < 50) OR {col} < -50"),
+    BenchmarkFunction("complex_bool", "({col} > 0 AND {col} < 25) OR ({col} < 0 AND {col} > -25) OR {col} = 0",
+                                      "({col} > 0 AND {col} < 25) OR ({col} < 0 AND {col} > -25) OR {col} = 0"),
+
+    # Comparison operators
+    BenchmarkFunction("eq", "{col} = 0", "{col} = 0"),
+    BenchmarkFunction("neq", "{col} <> 0", "{col} <> 0"),
+    BenchmarkFunction("lt", "{col} < 0", "{col} < 0"),
+    BenchmarkFunction("lte", "{col} <= 0", "{col} <= 0"),
+    BenchmarkFunction("gt", "{col} > 0", "{col} > 0"),
+    BenchmarkFunction("gte", "{col} >= 0", "{col} >= 0"),
+
+    # BETWEEN and IN
+    BenchmarkFunction("between", "{col} BETWEEN -50 AND 50", "{col} BETWEEN -50 AND 50"),
+    BenchmarkFunction("not_between", "{col} NOT BETWEEN -25 AND 25", "{col} NOT BETWEEN -25 AND 25"),
+    BenchmarkFunction("in_list_small", "{col} IN (1, 2, 3, 4, 5)", "{col} IN (1, 2, 3, 4, 5)"),
+    BenchmarkFunction("in_list_medium", "{col} IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)", "{col} IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"),
+    BenchmarkFunction("not_in", "{col} NOT IN (1, 2, 3, 4, 5)", "{col} NOT IN (1, 2, 3, 4, 5)"),
+
+    # NULL checks
+    BenchmarkFunction("is_null", "nullable_col IS NULL", "nullable_col IS NULL"),
+    BenchmarkFunction("is_not_null", "nullable_col IS NOT NULL", "nullable_col IS NOT NULL"),
+
+    # IF (conditional expression) - DataFusion uses CASE, DuckDB has IF
+    BenchmarkFunction("if_simple",
+        "CASE WHEN {col} > 0 THEN 'positive' ELSE 'non-positive' END",
+        "IF({col} > 0, 'positive', 'non-positive')"),
+    BenchmarkFunction("if_numeric",
+        "CASE WHEN {col} > 0 THEN {col} ELSE 0 END",
+        "IF({col} > 0, {col}, 0)"),
+    BenchmarkFunction("if_nested",
+        "CASE WHEN {col} > 0 THEN CASE WHEN {col} > 50 THEN 'high' ELSE 'low' END ELSE 'negative' END",
+        "IF({col} > 0, IF({col} > 50, 'high', 'low'), 'negative')"),
+]
+
+
+def generate_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table:
+    """Generate test data with integers and nullable values."""
+    random.seed(42)
+
+    values = []
+    nullable_values = []
+
+    for i in range(num_rows):
+        # Integer values in [-100, 100], covering the thresholds used by the conditional expressions
+        v = random.randint(-100, 100)
+        values.append(v)
+
+        # Nullable column: ~30% nulls
+        if random.random() < 0.3:
+            nullable_values.append(None)
+        else:
+            nullable_values.append(random.randint(-100, 100))
+
+    return pa.table({
+        'val_col': pa.array(values, type=pa.int64()),
+        'nullable_col': pa.array(nullable_values, type=pa.int64()),
+    })
+
+
+SUITE = Suite(
+    name="conditional",
+    description="Conditional/logic function benchmarks",
+    column_name="val_col",
+    functions=FUNCTIONS,
+    generate_data=generate_data,
+)