diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000000000..9dd627b01abed
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,13 @@
+FROM rust:bookworm
+
+RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
+ # Remove imagemagick due to https://security-tracker.debian.org/tracker/CVE-2019-10131
+ && apt-get purge -y imagemagick imagemagick-6-common
+
+# Add protoc
+# https://datafusion.apache.org/contributor-guide/getting_started.html#protoc-installation
+RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v25.1/protoc-25.1-linux-x86_64.zip \
+ && unzip protoc-25.1-linux-x86_64.zip -d $HOME/.local \
+ && rm protoc-25.1-linux-x86_64.zip
+
+ENV PATH="$PATH:$HOME/.local/bin"
\ No newline at end of file
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000000000..1af22306ed8c9
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,16 @@
+{
+ "build": {
+ "dockerfile": "./Dockerfile",
+ "context": "."
+ },
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "rust-lang.rust-analyzer"
+ ]
+ }
+ },
+ "features": {
+ "ghcr.io/devcontainers/features/rust:1": "latest"
+ }
+}
diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml
index 5578517ec3594..22d2f2187dd07 100644
--- a/.github/actions/setup-builder/action.yaml
+++ b/.github/actions/setup-builder/action.yaml
@@ -28,16 +28,18 @@ runs:
- name: Install Build Dependencies
shell: bash
run: |
- apt-get update
- apt-get install -y protobuf-compiler
+ RETRY=("ci/scripts/retry" timeout 120)
+ "${RETRY[@]}" apt-get update
+ "${RETRY[@]}" apt-get install -y protobuf-compiler
- name: Setup Rust toolchain
shell: bash
# rustfmt is needed for the substrait build script
run: |
+ RETRY=("ci/scripts/retry" timeout 120)
echo "Installing ${{ inputs.rust-version }}"
- rustup toolchain install ${{ inputs.rust-version }}
- rustup default ${{ inputs.rust-version }}
- rustup component add rustfmt
+ "${RETRY[@]}" rustup toolchain install ${{ inputs.rust-version }}
+ "${RETRY[@]}" rustup default ${{ inputs.rust-version }}
+ "${RETRY[@]}" rustup component add rustfmt
- name: Configure rust runtime env
uses: ./.github/actions/setup-rust-runtime
- name: Fixup git permissions
diff --git a/.github/actions/setup-macos-aarch64-builder/action.yaml b/.github/actions/setup-macos-aarch64-builder/action.yaml
index c4e14906ed108..288799a284b01 100644
--- a/.github/actions/setup-macos-aarch64-builder/action.yaml
+++ b/.github/actions/setup-macos-aarch64-builder/action.yaml
@@ -30,8 +30,8 @@ runs:
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
- export PROTO_ZIP="protoc-21.4-osx-aarch_64.zip"
- curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
+ export PROTO_ZIP="protoc-29.1-osx-aarch_64.zip"
+ curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.1/$PROTO_ZIP
unzip $PROTO_ZIP
echo "$HOME/d/protoc/bin" >> $GITHUB_PATH
export PATH=$PATH:$HOME/d/protoc/bin
@@ -43,5 +43,7 @@ runs:
rustup toolchain install stable
rustup default stable
rustup component add rustfmt
+ - name: Setup rust cache
+ uses: Swatinem/rust-cache@v2
- name: Configure rust runtime env
uses: ./.github/actions/setup-rust-runtime
diff --git a/.github/actions/setup-macos-builder/action.yaml b/.github/actions/setup-macos-builder/action.yaml
index 02419f6179429..fffdab160b043 100644
--- a/.github/actions/setup-macos-builder/action.yaml
+++ b/.github/actions/setup-macos-builder/action.yaml
@@ -30,8 +30,8 @@ runs:
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
- export PROTO_ZIP="protoc-21.4-osx-x86_64.zip"
- curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
+ export PROTO_ZIP="protoc-29.1-osx-x86_64.zip"
+ curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.1/$PROTO_ZIP
unzip $PROTO_ZIP
echo "$HOME/d/protoc/bin" >> $GITHUB_PATH
export PATH=$PATH:$HOME/d/protoc/bin
diff --git a/.github/actions/setup-windows-builder/action.yaml b/.github/actions/setup-windows-builder/action.yaml
index 5e937358c7d74..a0304168c744e 100644
--- a/.github/actions/setup-windows-builder/action.yaml
+++ b/.github/actions/setup-windows-builder/action.yaml
@@ -30,8 +30,8 @@ runs:
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
- export PROTO_ZIP="protoc-21.4-win64.zip"
- curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
+ export PROTO_ZIP="protoc-29.1-win64.zip"
+ curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v29.1/$PROTO_ZIP
unzip $PROTO_ZIP
export PATH=$PATH:$HOME/d/protoc/bin
protoc.exe --version
diff --git a/.github/workflows/dependencies.yml b/.github/workflows/dependencies.yml
index ebc5bcf91c94b..f87215565bb53 100644
--- a/.github/workflows/dependencies.yml
+++ b/.github/workflows/dependencies.yml
@@ -42,6 +42,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 19af21ec910be..cf204b2cd6c12 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -23,18 +23,12 @@ concurrency:
cancel-in-progress: true
jobs:
- rat:
- name: Release Audit Tool (RAT)
+ license-header-check:
runs-on: ubuntu-latest
+ name: Check License Header
steps:
- - name: Checkout
- uses: actions/checkout@v4
- - name: Setup Python
- uses: actions/setup-python@v5
- with:
- python-version: "3.10"
- - name: Audit licenses
- run: ./dev/release/run-rat.sh .
+ - uses: actions/checkout@v4
+ - uses: korandoru/hawkeye@v5
prettier:
name: Use prettier to check formatting of documents
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 44ca5aaf4eda1..0b43339f57a61 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -26,7 +26,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
- python-version: "3.10"
+ python-version: "3.12"
- name: Install dependencies
run: |
@@ -61,4 +61,4 @@ jobs:
git add --all
git commit -m 'Publish built docs triggered by ${{ github.sha }}'
git push || git push --force
- fi
\ No newline at end of file
+ fi
diff --git a/.github/workflows/docs_pr.yaml b/.github/workflows/docs_pr.yaml
index c2f3dd684a23e..3fad08643aa22 100644
--- a/.github/workflows/docs_pr.yaml
+++ b/.github/workflows/docs_pr.yaml
@@ -43,6 +43,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml
new file mode 100644
index 0000000000000..b98e0a1740cbe
--- /dev/null
+++ b/.github/workflows/extended.yml
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: DataFusion extended tests
+
+concurrency:
+ group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+ cancel-in-progress: true
+
+# https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#running-your-pull_request-workflow-when-a-pull-request-merges
+#
+# this job is intended to run only on the main branch, as it is time consuming
+# and should not fail often. However, it provides important coverage to ensure correctness
+# in the (very rare) event of a hash failure or sqlite query failure.
+on:
+ # Run on all commits to main
+ push:
+ branches:
+ - main
+
+jobs:
+ # Check answers are correct when hash values collide
+ hash-collisions:
+ name: cargo test hash collisions (amd64)
+ runs-on: ubuntu-latest
+ container:
+ image: amd64/rust
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ fetch-depth: 1
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: stable
+ - name: Run tests
+ run: |
+ cd datafusion
+ cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro
+ sqllogictest-sqlite:
+ name: "Run sqllogictests with the sqlite test suite"
+ runs-on: ubuntu-latest
+ container:
+ image: amd64/rust
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ fetch-depth: 1
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: stable
+ - name: Run sqllogictest
+ run: cargo test --profile release-nonlto --test sqllogictests -- --include-sqlite
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 4527d047e4c07..7ac0dfa78215c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -39,9 +39,17 @@ on:
workflow_dispatch:
jobs:
- # Check crate compiles
+ # Check license header
+ license-header-check:
+ runs-on: ubuntu-20.04
+ name: Check License Header
+ steps:
+ - uses: actions/checkout@v4
+ - uses: korandoru/hawkeye@v5
+
+ # Check crate compiles and base cargo check passes
linux-build-lib:
- name: cargo check
+ name: linux build test
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -51,89 +59,119 @@ jobs:
uses: ./.github/actions/setup-builder
with:
rust-version: stable
+ - name: Prepare cargo build
+ run: cargo check --profile ci --all-targets
- - name: Cache Cargo
- uses: actions/cache@v4
+ # cargo check common, functions and substrait with no default features
+ linux-cargo-check-no-default-features:
+ name: cargo check no default features
+ needs: linux-build-lib
+ runs-on: ubuntu-latest
+ container:
+ image: amd64/rust
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
with:
- path: |
- ~/.cargo/bin/
- ~/.cargo/registry/index/
- ~/.cargo/registry/cache/
- ~/.cargo/git/db/
- ./target/
- ./datafusion-cli/target/
- key: cargo-cache-${{ hashFiles('**/Cargo.toml', '**/Cargo.lock') }}
-
+ rust-version: stable
- name: Check datafusion without default features
# Some of the test binaries require the parquet feature still
#run: cargo check --all-targets --no-default-features -p datafusion
- run: cargo check --no-default-features -p datafusion
+ run: cargo check --profile ci --no-default-features -p datafusion
- name: Check datafusion-common without default features
- run: cargo check --all-targets --no-default-features -p datafusion-common
+ run: cargo check --profile ci --all-targets --no-default-features -p datafusion-common
- - name: Check datafusion-functions
- run: cargo check --all-targets --no-default-features -p datafusion-functions
+ - name: Check datafusion-functions without default features
+ run: cargo check --profile ci --all-targets --no-default-features -p datafusion-functions
+
+ - name: Check datafusion-substrait without default features
+ run: cargo check --profile ci --all-targets --no-default-features -p datafusion-substrait
- name: Check workspace in debug mode
- run: cargo check --all-targets --workspace
+ run: cargo check --profile ci --all-targets --workspace
- name: Check workspace with avro,json features
- run: cargo check --workspace --benches --features avro,json
+ run: cargo check --profile ci --workspace --benches --features avro,json
- name: Check Cargo.lock for datafusion-cli
run: |
# If this test fails, try running `cargo update` in the `datafusion-cli` directory
# and check in the updated Cargo.lock file.
- cargo check --manifest-path datafusion-cli/Cargo.toml --locked
+ cargo check --profile ci --manifest-path datafusion-cli/Cargo.toml --locked
- # Ensure that the datafusion crate can be built with only a subset of the function
- # packages enabled.
+ # cargo check datafusion to ensure that the datafusion crate can be built with only a
+ # subset of the function packages enabled.
+ linux-cargo-check-datafusion:
+ name: cargo check datafusion
+ needs: linux-build-lib
+ runs-on: ubuntu-latest
+ container:
+ image: amd64/rust
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: stable
- name: Check datafusion (nested_expressions)
- run: cargo check --no-default-features --features=nested_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=nested_expressions -p datafusion
- name: Check datafusion (crypto)
- run: cargo check --no-default-features --features=crypto_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=crypto_expressions -p datafusion
- name: Check datafusion (datetime_expressions)
- run: cargo check --no-default-features --features=datetime_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=datetime_expressions -p datafusion
- name: Check datafusion (encoding_expressions)
- run: cargo check --no-default-features --features=encoding_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=encoding_expressions -p datafusion
- name: Check datafusion (math_expressions)
- run: cargo check --no-default-features --features=math_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=math_expressions -p datafusion
- name: Check datafusion (regex_expressions)
- run: cargo check --no-default-features --features=regex_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=regex_expressions -p datafusion
- name: Check datafusion (string_expressions)
- run: cargo check --no-default-features --features=string_expressions -p datafusion
+ run: cargo check --profile ci --no-default-features --features=string_expressions -p datafusion
- # Ensure that the datafusion-functions crate can be built with only a subset of the function
- # packages enabled.
+ # cargo check datafusion-functions to ensure that the datafusion-functions crate can be built with
+ # only a subset of the function packages enabled.
+ linux-cargo-check-datafusion-functions:
+ name: cargo check functions
+ needs: linux-build-lib
+ runs-on: ubuntu-latest
+ container:
+ image: amd64/rust
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: stable
- name: Check datafusion-functions (crypto)
- run: cargo check --all-targets --no-default-features --features=crypto_expressions -p datafusion-functions
+ run: cargo check --profile ci --all-targets --no-default-features --features=crypto_expressions -p datafusion-functions
- name: Check datafusion-functions (datetime_expressions)
- run: cargo check --all-targets --no-default-features --features=datetime_expressions -p datafusion-functions
+ run: cargo check --profile ci --all-targets --no-default-features --features=datetime_expressions -p datafusion-functions
- name: Check datafusion-functions (encoding_expressions)
- run: cargo check --all-targets --no-default-features --features=encoding_expressions -p datafusion-functions
+ run: cargo check --profile ci --all-targets --no-default-features --features=encoding_expressions -p datafusion-functions
- name: Check datafusion-functions (math_expressions)
- run: cargo check --all-targets --no-default-features --features=math_expressions -p datafusion-functions
+ run: cargo check --profile ci --all-targets --no-default-features --features=math_expressions -p datafusion-functions
- name: Check datafusion-functions (regex_expressions)
- run: cargo check --all-targets --no-default-features --features=regex_expressions -p datafusion-functions
+ run: cargo check --profile ci --all-targets --no-default-features --features=regex_expressions -p datafusion-functions
- name: Check datafusion-functions (string_expressions)
- run: cargo check --all-targets --no-default-features --features=string_expressions -p datafusion-functions
+ run: cargo check --profile ci --all-targets --no-default-features --features=string_expressions -p datafusion-functions
# Run tests
linux-test:
name: cargo test (amd64)
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -141,18 +179,19 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
- rust-version: stable
+ rust-version: stable
- name: Run tests (excluding doctests)
- run: cargo test --lib --tests --bins --features avro,json,backtrace
+ run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace
- name: Verify Working Directory Clean
run: git diff --exit-code
linux-test-datafusion-cli:
name: cargo test datafusion-cli (amd64)
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -160,6 +199,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
@@ -167,13 +207,13 @@ jobs:
- name: Run tests (excluding doctests)
run: |
cd datafusion-cli
- cargo test --lib --tests --bins --all-features
+ cargo test --profile ci --lib --tests --bins --all-features
- name: Verify Working Directory Clean
run: git diff --exit-code
linux-test-example:
name: cargo examples (amd64)
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -181,6 +221,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
@@ -188,18 +229,16 @@ jobs:
- name: Run examples
run: |
# test datafusion-sql examples
- cargo run --example sql
+ cargo run --profile ci --example sql
# test datafusion-examples
ci/scripts/rust_example.sh
- name: Verify Working Directory Clean
run: git diff --exit-code
-
-
# Run `cargo test doc` (test documentation examples)
linux-test-doc:
name: cargo test doc (amd64)
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -207,22 +246,23 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: stable
- name: Run doctests
run: |
- cargo test --doc --features avro,json
+ cargo test --profile ci --doc --features avro,json
cd datafusion-cli
- cargo test --doc --all-features
+ cargo test --profile ci --doc --all-features
- name: Verify Working Directory Clean
run: git diff --exit-code
# Run `cargo doc` to ensure the rustdoc is clean
linux-rustdoc:
name: cargo doc
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -255,7 +295,7 @@ jobs:
# verify that the benchmark queries return the correct results
verify-benchmark-results:
name: verify benchmark results (amd64)
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -263,6 +303,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
@@ -277,17 +318,20 @@ jobs:
mv *.tbl ../datafusion/sqllogictest/test_files/tpch/data
- name: Verify that benchmark queries return expected results
run: |
+ # increase stack size to fix stack overflow
+ export RUST_MIN_STACK=20971520
export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data`
- # use release build for plan verificaton because debug build causes stack overflow
- cargo test plan_q --package datafusion-benchmarks --profile release-nonlto --features=ci -- --test-threads=1
- INCLUDE_TPCH=true cargo test --test sqllogictests
+ cargo test plan_q --package datafusion-benchmarks --profile ci --features=ci -- --test-threads=1
+ INCLUDE_TPCH=true cargo test --profile ci --package datafusion-sqllogictest --test sqllogictests
- name: Verify Working Directory Clean
run: git diff --exit-code
sqllogictest-postgres:
name: "Run sqllogictest with Postgres runner"
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
+ container:
+ image: amd64/rust
services:
postgres:
image: postgres:15
@@ -296,7 +340,7 @@ jobs:
POSTGRES_DB: db_test
POSTGRES_INITDB_ARGS: --encoding=UTF-8 --lc-collate=C --lc-ctype=C
ports:
- - 5432/tcp
+ - 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
@@ -306,47 +350,59 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
- - name: Setup toolchain
- run: |
- rustup toolchain install stable
- rustup default stable
+ fetch-depth: 1
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: stable
- name: Run sqllogictest
- run: PG_COMPAT=true PG_URI="postgresql://postgres:postgres@localhost:$POSTGRES_PORT/db_test" cargo test --features=postgres --test sqllogictests
+ run: |
+ cd datafusion/sqllogictest
+ PG_COMPAT=true PG_URI="postgresql://postgres:postgres@$POSTGRES_HOST:$POSTGRES_PORT/db_test" cargo test --profile ci --features=postgres --test sqllogictests
env:
+ # use postgres for the host here because we have specified a container for the job
+ POSTGRES_HOST: postgres
POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}
- windows:
- name: cargo test (win64)
- runs-on: windows-latest
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: true
- - name: Setup Rust toolchain
- uses: ./.github/actions/setup-windows-builder
- - name: Run tests (excluding doctests)
- shell: bash
- run: |
- export PATH=$PATH:$HOME/d/protoc/bin
- cargo test --lib --tests --bins --features avro,json,backtrace
- cd datafusion-cli
- cargo test --lib --tests --bins --all-features
-
- macos:
- name: cargo test (macos)
- runs-on: macos-latest
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: true
- - name: Setup Rust toolchain
- uses: ./.github/actions/setup-macos-builder
- - name: Run tests (excluding doctests)
- shell: bash
- run: |
- cargo test --lib --tests --bins --features avro,json,backtrace
- cd datafusion-cli
- cargo test --lib --tests --bins --all-features
+# Temporarily commenting out the Windows flow; the reason is the enormously slow build.
+# Waiting for the new Windows 2025 GitHub runner.
+# Details: https://github.com/apache/datafusion/issues/13726
+#
+# windows:
+# name: cargo test (win64)
+# runs-on: windows-latest
+# steps:
+# - uses: actions/checkout@v4
+# with:
+# submodules: true
+# - name: Setup Rust toolchain
+# uses: ./.github/actions/setup-windows-builder
+# - name: Run tests (excluding doctests)
+# shell: bash
+# run: |
+# export PATH=$PATH:$HOME/d/protoc/bin
+# cargo test --lib --tests --bins --features avro,json,backtrace
+# cd datafusion-cli
+# cargo test --lib --tests --bins --all-features
+
+# Commenting out intel mac build as so few users would ever use it
+# Details: https://github.com/apache/datafusion/issues/13846
+# macos:
+# name: cargo test (macos)
+# runs-on: macos-latest
+# steps:
+# - uses: actions/checkout@v4
+# with:
+# submodules: true
+# fetch-depth: 1
+# - name: Setup Rust toolchain
+# uses: ./.github/actions/setup-macos-builder
+# - name: Run tests (excluding doctests)
+# shell: bash
+# run: |
+# cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace
+# cd datafusion-cli
+# cargo test --profile ci --lib --tests --bins --all-features
macos-aarch64:
name: cargo test (macos-aarch64)
@@ -355,18 +411,19 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-macos-aarch64-builder
- name: Run tests (excluding doctests)
shell: bash
run: |
- cargo test --lib --tests --bins --features avro,json,backtrace
+ cargo test --profile ci --lib --tests --bins --features avro,json,backtrace
cd datafusion-cli
- cargo test --lib --tests --bins --all-features
+ cargo test --profile ci --lib --tests --bins --all-features
test-datafusion-pyarrow:
name: cargo test pyarrow (amd64)
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-20.04
container:
image: amd64/rust:bullseye # Workaround https://github.com/actions/setup-python/issues/721
@@ -374,6 +431,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- uses: actions/setup-python@v5
with:
python-version: "3.8"
@@ -386,7 +444,7 @@ jobs:
with:
rust-version: stable
- name: Run datafusion-common tests
- run: cargo test -p datafusion-common --features=pyarrow
+ run: cargo test --profile ci -p datafusion-common --features=pyarrow
vendor:
name: Verify Vendored Code
@@ -397,6 +455,8 @@ jobs:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
+ with:
+ rust-version: stable
- name: Run gen
run: ./regen.sh
working-directory: ./datafusion/proto
@@ -463,7 +523,7 @@ jobs:
clippy:
name: clippy
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -471,6 +531,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
@@ -480,29 +541,9 @@ jobs:
- name: Run clippy
run: ci/scripts/rust_clippy.sh
- # Check answers are correct when hash values collide
- hash-collisions:
- name: cargo test hash collisions (amd64)
- needs: [ linux-build-lib ]
- runs-on: ubuntu-latest
- container:
- image: amd64/rust
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: true
- - name: Setup Rust toolchain
- uses: ./.github/actions/setup-builder
- with:
- rust-version: stable
- - name: Run tests
- run: |
- cd datafusion
- cargo test --lib --tests --features=force_hash_collisions,avro
-
cargo-toml-formatting-checks:
name: check Cargo.toml formatting
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -510,6 +551,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
@@ -522,7 +564,7 @@ jobs:
config-docs-check:
name: check configs.md and ***_functions.md is up-to-date
- needs: [ linux-build-lib ]
+ needs: linux-build-lib
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -530,6 +572,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
+ fetch-depth: 1
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
@@ -574,9 +617,9 @@ jobs:
#
# To reproduce:
# 1. Install the version of Rust that is failing. Example:
- # rustup install 1.78.0
+ # rustup install 1.80.1
# 2. Run the command that failed with that version. Example:
- # cargo +1.78.0 check -p datafusion
+ # cargo +1.80.1 check -p datafusion
#
# To resolve, either:
# 1. Change your code to use older Rust features,
@@ -595,4 +638,4 @@ jobs:
run: cargo msrv --output-format json --log-target stdout verify
- name: Check datafusion-cli
working-directory: datafusion-cli
- run: cargo msrv --output-format json --log-target stdout verify
\ No newline at end of file
+ run: cargo msrv --output-format json --log-target stdout verify
diff --git a/.gitignore b/.gitignore
index 05570eacf630c..1fa79249ff8e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,6 +64,12 @@ datafusion/sqllogictests/test_files/tpch/data/*
# Scratch temp dir for sqllogictests
datafusion/sqllogictest/test_files/scratch*
+# temp file for core
+datafusion/core/*.parquet
+
+# Generated core benchmark data
+datafusion/core/benches/data/*
+
# rat
filtered_rat.txt
rat.txt
diff --git a/.gitmodules b/.gitmodules
index ec5d6208b8ddb..037accdbe4241 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,7 @@
[submodule "testing"]
path = testing
url = https://github.com/apache/arrow-testing
+[submodule "datafusion-testing"]
+ path = datafusion-testing
+ url = https://github.com/apache/datafusion-testing.git
+ branch = main
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea0c339ac4514..c481ce0b96a0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@
under the License.
-->
-* [DataFusion CHANGELOG](./datafusion/CHANGELOG.md)
+Change logs for each release can be found [here](dev/changelog).
+
For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md).
diff --git a/Cargo.toml b/Cargo.toml
index 448607257ca1e..aa412cba51087 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,9 +26,11 @@ members = [
"datafusion/expr",
"datafusion/expr-common",
"datafusion/execution",
+ "datafusion/ffi",
"datafusion/functions",
"datafusion/functions-aggregate",
"datafusion/functions-aggregate-common",
+ "datafusion/functions-table",
"datafusion/functions-nested",
"datafusion/functions-window",
"datafusion/functions-window-common",
@@ -46,8 +48,13 @@ members = [
"datafusion/substrait",
"datafusion/wasmtest",
"datafusion-examples",
+ "datafusion-examples/examples/ffi/ffi_example_table_provider",
+ "datafusion-examples/examples/ffi/ffi_module_interface",
+ "datafusion-examples/examples/ffi/ffi_module_loader",
"test-utils",
"benchmarks",
+ "datafusion/macros",
+ "datafusion/doc",
]
resolver = "2"
@@ -58,75 +65,75 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
-rust-version = "1.78"
-version = "42.0.0"
+rust-version = "1.80.1"
+version = "44.0.0"
[workspace.dependencies]
# We turn off default-features for some dependencies here so the workspaces which inherit them can
# selectively turn them on if needed, since we can override default-features = true (from false)
# for the inherited dependency but cannot do the reverse (override from true to false).
#
-# See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
+# See for more details: https://github.com/rust-lang/cargo/issues/11329
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
-arrow = { version = "53.1.0", features = [
+arrow = { version = "54.0.0", features = [
"prettyprint",
] }
-arrow-array = { version = "53.1.0", default-features = false, features = [
+arrow-array = { version = "54.0.0", default-features = false, features = [
"chrono-tz",
] }
-arrow-buffer = { version = "53.1.0", default-features = false }
-arrow-flight = { version = "53.1.0", features = [
+arrow-buffer = { version = "54.0.0", default-features = false }
+arrow-flight = { version = "54.0.0", features = [
"flight-sql-experimental",
] }
-arrow-ipc = { version = "53.1.0", default-features = false, features = [
+arrow-ipc = { version = "54.0.0", default-features = false, features = [
"lz4",
] }
-arrow-ord = { version = "53.1.0", default-features = false }
-arrow-schema = { version = "53.1.0", default-features = false }
-arrow-string = { version = "53.1.0", default-features = false }
+arrow-ord = { version = "54.0.0", default-features = false }
+arrow-schema = { version = "54.0.0", default-features = false }
async-trait = "0.1.73"
-bigdecimal = "=0.4.1"
+bigdecimal = "0.4.7"
bytes = "1.4"
chrono = { version = "0.4.38", default-features = false }
-ctor = "0.2.0"
+ctor = "0.2.9"
dashmap = "6.0.1"
-datafusion = { path = "datafusion/core", version = "42.0.0", default-features = false }
-datafusion-catalog = { path = "datafusion/catalog", version = "42.0.0" }
-datafusion-common = { path = "datafusion/common", version = "42.0.0", default-features = false }
-datafusion-common-runtime = { path = "datafusion/common-runtime", version = "42.0.0" }
-datafusion-execution = { path = "datafusion/execution", version = "42.0.0" }
-datafusion-expr = { path = "datafusion/expr", version = "42.0.0" }
-datafusion-expr-common = { path = "datafusion/expr-common", version = "42.0.0" }
-datafusion-functions = { path = "datafusion/functions", version = "42.0.0" }
-datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "42.0.0" }
-datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "42.0.0" }
-datafusion-functions-nested = { path = "datafusion/functions-nested", version = "42.0.0" }
-datafusion-functions-window = { path = "datafusion/functions-window", version = "42.0.0" }
-datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "42.0.0" }
-datafusion-optimizer = { path = "datafusion/optimizer", version = "42.0.0", default-features = false }
-datafusion-physical-expr = { path = "datafusion/physical-expr", version = "42.0.0", default-features = false }
-datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "42.0.0", default-features = false }
-datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "42.0.0" }
-datafusion-physical-plan = { path = "datafusion/physical-plan", version = "42.0.0" }
-datafusion-proto = { path = "datafusion/proto", version = "42.0.0" }
-datafusion-proto-common = { path = "datafusion/proto-common", version = "42.0.0" }
-datafusion-sql = { path = "datafusion/sql", version = "42.0.0" }
-datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "42.0.0" }
-datafusion-substrait = { path = "datafusion/substrait", version = "42.0.0" }
+datafusion = { path = "datafusion/core", version = "44.0.0", default-features = false }
+datafusion-catalog = { path = "datafusion/catalog", version = "44.0.0" }
+datafusion-common = { path = "datafusion/common", version = "44.0.0", default-features = false }
+datafusion-common-runtime = { path = "datafusion/common-runtime", version = "44.0.0" }
+datafusion-doc = { path = "datafusion/doc", version = "44.0.0" }
+datafusion-execution = { path = "datafusion/execution", version = "44.0.0" }
+datafusion-expr = { path = "datafusion/expr", version = "44.0.0" }
+datafusion-expr-common = { path = "datafusion/expr-common", version = "44.0.0" }
+datafusion-ffi = { path = "datafusion/ffi", version = "44.0.0" }
+datafusion-functions = { path = "datafusion/functions", version = "44.0.0" }
+datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "44.0.0" }
+datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "44.0.0" }
+datafusion-functions-nested = { path = "datafusion/functions-nested", version = "44.0.0" }
+datafusion-functions-table = { path = "datafusion/functions-table", version = "44.0.0" }
+datafusion-functions-window = { path = "datafusion/functions-window", version = "44.0.0" }
+datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "44.0.0" }
+datafusion-macros = { path = "datafusion/macros", version = "44.0.0" }
+datafusion-optimizer = { path = "datafusion/optimizer", version = "44.0.0", default-features = false }
+datafusion-physical-expr = { path = "datafusion/physical-expr", version = "44.0.0", default-features = false }
+datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "44.0.0", default-features = false }
+datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "44.0.0" }
+datafusion-physical-plan = { path = "datafusion/physical-plan", version = "44.0.0" }
+datafusion-proto = { path = "datafusion/proto", version = "44.0.0" }
+datafusion-proto-common = { path = "datafusion/proto-common", version = "44.0.0" }
+datafusion-sql = { path = "datafusion/sql", version = "44.0.0" }
doc-comment = "0.3"
env_logger = "0.11"
futures = "0.3"
half = { version = "2.2.1", default-features = false }
hashbrown = { version = "0.14.5", features = ["raw"] }
indexmap = "2.0.0"
-itertools = "0.13"
+itertools = "0.14"
log = "^0.4"
-num_cpus = "1.13.0"
object_store = { version = "0.11.0", default-features = false }
parking_lot = "0.12"
-parquet = { version = "53.1.0", default-features = false, features = [
+parquet = { version = "54.0.0", default-features = false, features = [
"arrow",
"async",
"object_store",
@@ -136,14 +143,14 @@ pbjson = { version = "0.7.0" }
prost = "0.13.1"
prost-derive = "0.13.1"
rand = "0.8"
+recursive = "0.1.1"
regex = "1.8"
-rstest = "0.23.0"
+rstest = "0.24.0"
serde_json = "1"
-sqlparser = { version = "0.51.0", features = ["visitor"] }
+sqlparser = { version = "0.53.0", features = ["visitor"] }
tempfile = "3"
-thiserror = "1.0.44"
tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
-url = "2.2"
+url = "2.5.4"
[profile.release]
codegen-units = 1
@@ -163,9 +170,21 @@ overflow-checks = false
panic = 'unwind'
rpath = false
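+# The `ci` profile below is what the GitHub Actions jobs build with (via `--profile ci`)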
+[profile.ci]
+inherits = "dev"
+incremental = false
+
+# the ci profile turns off debug info, etc. for dependencies to allow for smaller binaries, making caching more effective
+[profile.ci.package."*"]
+debug = false
+debug-assertions = false
+strip = "debuginfo"
+incremental = false
+
[workspace.lints.clippy]
# Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml)
large_futures = "warn"
[workspace.lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
+unused_qualifications = "deny"
diff --git a/README.md b/README.md
index 5d0b096c1de11..2a20faa9e2fa5 100644
--- a/README.md
+++ b/README.md
@@ -38,14 +38,15 @@
[Chat](https://discord.com/channels/885562378132000778/885562378132000781)
-
+
DataFusion is an extensible query engine written in [Rust] that
uses [Apache Arrow] as its in-memory format.
-The DataFusion libraries in this repository are used to build data-centric system software. DataFusion also provides the
-following subprojects, which are packaged versions of DataFusion intended for end users.
+This crate provides libraries and binaries for developers building fast and
+feature-rich database and analytic systems, customized to particular workloads.
+See [use cases] for examples. The following related subprojects target end users:
- [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame
queries.
@@ -54,13 +55,10 @@ following subprojects, which are packaged versions of DataFusion intended for en
- [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on
DataFusion.
-The target audience for the DataFusion crates in this repository are
-developers building fast and feature rich database and analytic systems,
-customized to particular workloads. See [use cases] for examples.
-
-DataFusion offers [SQL] and [`Dataframe`] APIs,
-excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
-extensive customization, and a great community.
+"Out of the box,"
+DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance],
+built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and
+a great community.
DataFusion features a full query planner, a columnar, streaming, multi-threaded,
vectorized execution engine, and partitioned data sources. You can
@@ -114,7 +112,8 @@ Default features:
- `parquet`: support for reading the [Apache Parquet] format
- `regex_expressions`: regular expression functions, such as `regexp_match`
- `unicode_expressions`: Include unicode aware functions such as `character_length`
-- `unparser` : enables support to reverse LogicalPlans back into SQL
+- `unparser`: enables support to reverse LogicalPlans back into SQL
+- `recursive_protection`: uses [recursive](https://docs.rs/recursive/latest/recursive/) for stack overflow protection.
Optional features:
@@ -128,11 +127,46 @@ Optional features:
## Rust Version Compatibility Policy
-DataFusion's Minimum Required Stable Rust Version (MSRV) policy is to support stable [4 latest
-Rust versions](https://releases.rs) OR the stable minor Rust version as of 4 months, whichever is lower.
+The Rust toolchain releases are tracked at [Rust Versions](https://releases.rs) and follow
+[semantic versioning](https://semver.org/). A Rust toolchain release can be identified
+by a version string like `1.80.0`, or more generally `major.minor.patch`.
+
+DataFusion supports the last 4 stable Rust minor versions released and any such versions released within the last 4 months.
For example, given the releases `1.78.0`, `1.79.0`, `1.80.0`, `1.80.1` and `1.81.0` DataFusion will support 1.78.0, which is 3 minor versions prior to the most recent minor version `1.81`.
-If a hotfix is released for the minimum supported Rust version (MSRV), the MSRV will be the minor version with all hotfixes, even if it surpasses the four-month window.
+Note: If a Rust hotfix is released for the current MSRV, the MSRV will be updated to the specific minor version that includes all applicable hotfixes; this takes precedence over the policies above.
+
+DataFusion enforces the MSRV policy using an [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code).
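+
+For reference, the MSRV is declared via the `rust-version` field of the workspace
+`Cargo.toml` (a sketch; the value shown matches this release and moves over time as
+the policy window advances):
+
+```toml
+[workspace.package]
+rust-version = "1.80.1"
+```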
+
+## DataFusion API Evolution and Deprecation Guidelines
+
+Public methods in Apache DataFusion evolve over time: while we try to maintain a
+stable API, we also improve the API over time. As a result, we typically
+deprecate methods before removing them, according to the [deprecation guidelines].
+
+[deprecation guidelines]: https://datafusion.apache.org/library-user-guide/api-health.html
+
+## Dependencies and a `Cargo.lock`
+
+`datafusion` is intended for use as a library and thus purposely does not have a
+`Cargo.lock` file checked in. You can read more about the distinction in the
+[Cargo book].
+
+CI tests always run against the latest compatible versions of all dependencies
+(the equivalent of doing `cargo update`), as suggested in the [Cargo CI guide]
+and we rely on Dependabot for other upgrades. This strategy has two problems
+that occasionally arise:
+
+1. CI failures when downstream libraries upgrade in some incompatible way
+2. Local development builds that fail when DataFusion inadvertently relies on
+ a feature in a newer version of a dependency than declared in `Cargo.toml`
+ (e.g. a new method is added to a trait that we use).
+
+However, we think the current strategy is the best tradeoff between maintenance
+overhead and user experience and ensures DataFusion always works with the latest
+compatible versions of all dependencies. If you encounter either of these
+problems, please open an issue or PR.
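+
+To approximate locally what CI does, you can first move all dependencies to their
+latest compatible versions before building (a minimal sketch using standard cargo
+commands; adjust the check flags to your needs):
+
+```bash
+# resolve all dependencies to their newest compatible versions, as CI effectively does
+cargo update
+# then build and check as usual
+cargo check --workspace --all-targets
+```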
-We enforce this policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)
+[cargo book]: https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+[cargo ci guide]: https://doc.rust-lang.org/cargo/guide/continuous-integration.html#verifying-latest-dependencies
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 7f29f7471b6fc..ad8debaf2fa38 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -42,7 +42,6 @@ env_logger = { workspace = true }
futures = { workspace = true }
log = { workspace = true }
mimalloc = { version = "0.1", optional = true, default-features = false }
-num_cpus = { workspace = true }
parquet = { workspace = true, default-features = true }
serde = { version = "1.0.136", features = ["derive"] }
serde_json = { workspace = true }
diff --git a/benchmarks/README.md b/benchmarks/README.md
index afaf28bb75769..332cac8459d75 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -32,7 +32,7 @@ DataFusion is included in the benchmark setups for several popular
benchmarks that compare performance with other engines. For example:
* [ClickBench] scripts are in the [ClickBench repo](https://github.com/ClickHouse/ClickBench/tree/main/datafusion)
-* [H2o.ai `db-benchmark`] scripts are in [db-benchmark](db-benchmark) directory
+* [H2o.ai `db-benchmark`] scripts are in [db-benchmark](https://github.com/apache/datafusion/tree/main/benchmarks/src/h2o.rs)
[ClickBench]: https://github.com/ClickHouse/ClickBench/tree/main
[H2o.ai `db-benchmark`]: https://github.com/h2oai/db-benchmark
@@ -330,6 +330,40 @@ steps.
The tests sort the entire dataset using several different sort
orders.
+## Sort TPCH
+
+Test performance of end-to-end sort SQL queries. (While the `Sort` benchmark focuses on a single sort executor, this benchmark tests how sorting is executed across multiple CPU cores by benchmarking sorting of the whole relational table.)
+
+The sort integration benchmark runs whole-table sort queries on the TPCH `lineitem` table with different characteristics, for example different numbers of sort keys, different sort key cardinalities, and different numbers of payload columns.
+
+See [`sort_tpch.rs`](src/sort_tpch.rs) for more details.
+
+### Sort TPCH Benchmark Example Runs
+1. Run all queries with default setting:
+```bash
+ cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json'
+```
+
+2. Run a specific query:
+```bash
+ cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json' --query 2
+```
+
+3. Run all queries with `bench.sh` script:
+```bash
+./bench.sh run sort_tpch
+```
+
+## IMDB
+
+Run Join Order Benchmark (JOB) on IMDB dataset.
+
+The Internet Movie Database (IMDB) dataset contains real-world movie data. Unlike synthetic datasets such as TPCH, which assume uniform data distributions and uncorrelated columns, the IMDB dataset includes skewed data and correlated columns (which are common in real datasets), making it more suitable for testing query optimizers, particularly for cardinality estimation.
+
+This benchmark is derived from the [Join Order Benchmark](https://github.com/gregrahn/join-order-benchmark).
+
+See the paper [How Good Are Query Optimizers, Really?](http://www.vldb.org/pvldb/vol9/p204-leis.pdf) for more details.
+
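+For example, to generate the IMDB dataset and run this benchmark with the
+`bench.sh` script (see `data_imdb` and `run_imdb` in `bench.sh` for the underlying
+cargo invocations):
+
+```bash
+./bench.sh data imdb
+./bench.sh run imdb
+```
+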
## TPCH
Run the tpch benchmark.
@@ -342,32 +376,79 @@ This benchmarks is derived from the [TPC-H][1] version
[2]: https://github.com/databricks/tpch-dbgen.git,
[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
+## External Aggregation
+
+Run the benchmark for aggregations with limited memory.
-# Older Benchmarks
+When the memory limit is exceeded, intermediate aggregation results are spilled to disk and then read back with a sort-merge.
-## h2o benchmarks
+The external aggregation benchmark runs several aggregation queries with different memory limits on the TPCH `lineitem` table. Queries can be found in [`external_aggr.rs`](src/bin/external_aggr.rs).
+This benchmark is inspired by [DuckDB's external aggregation paper](https://hannes.muehleisen.org/publications/icde2024-out-of-core-kuiper-boncz-muehleisen.pdf), specifically Section VI.
+
+### External Aggregation Example Runs
+1. Run all queries with predefined memory limits:
```bash
-cargo run --release --bin h2o group-by --query 1 --path /mnt/bigdata/h2oai/N_1e7_K_1e2_single.csv --mem-table --debug
+# Under 'benchmarks/' directory
+cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json'
```
-Example run:
+2. Run a query with specific memory limit:
+```bash
+cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json' --query 1 --memory-limit 30M
+```
+3. Run all queries with `bench.sh` script:
+```bash
+./bench.sh data external_aggr
+./bench.sh run external_aggr
```
-Running benchmarks with the following options: GroupBy(GroupBy { query: 1, path: "/mnt/bigdata/h2oai/N_1e7_K_1e2_single.csv", debug: false })
-Executing select id1, sum(v1) as v1 from x group by id1
-+-------+--------+
-| id1 | v1 |
-+-------+--------+
-| id063 | 199420 |
-| id094 | 200127 |
-| id044 | 198886 |
-...
-| id093 | 200132 |
-| id003 | 199047 |
-+-------+--------+
-h2o groupby query 1 took 1669 ms
+
+## h2o benchmarks for groupby
+
+### Generate data for h2o benchmarks
+There are three options for generating data for h2o benchmarks: `small`, `medium`, and `big`. The data is generated in the `data` directory.
+
+1. Generate small data (1e7 rows)
+```bash
+./bench.sh data h2o_small
+```
+
+
+2. Generate medium data (1e8 rows)
+```bash
+./bench.sh data h2o_medium
+```
+
+
+3. Generate large data (1e9 rows)
+```bash
+./bench.sh data h2o_big
+```
+
+### Run h2o benchmarks
+There are three options for running h2o benchmarks: `small`, `medium`, and `big`.
+1. Run small data benchmark
+```bash
+./bench.sh run h2o_small
+```
+
+2. Run medium data benchmark
+```bash
+./bench.sh run h2o_medium
+```
+
+3. Run large data benchmark
+```bash
+./bench.sh run h2o_big
+```
+
+4. Run a specific query with a specific data path
+
+For example, to run query 1 with the small data generated above:
+```bash
+cargo run --release --bin dfbench -- h2o --path ./benchmarks/data/h2o/G1_1e7_1e7_100_0.csv --query 1
```
[1]: http://www.tpc.org/tpch/
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 70faa9ef2b737..20cb32722c879 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -75,9 +75,14 @@ tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB),
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
+sort_tpch: Benchmark of sorting speed for end-to-end sort queries on TPCH dataset
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
+external_aggr: External aggregation benchmark
+h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
+h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
+h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
**********
* Supported Configuration (Environment Variables)
@@ -140,6 +145,9 @@ main() {
all)
data_tpch "1"
data_tpch "10"
+ data_h2o "SMALL"
+ data_h2o "MEDIUM"
+ data_h2o "BIG"
data_clickbench_1
data_clickbench_partitioned
data_imdb
@@ -170,6 +178,23 @@ main() {
imdb)
data_imdb
;;
+ h2o_small)
+ data_h2o "SMALL" "CSV"
+ ;;
+ h2o_medium)
+ data_h2o "MEDIUM" "CSV"
+ ;;
+ h2o_big)
+ data_h2o "BIG" "CSV"
+ ;;
+ external_aggr)
+ # same data as for tpch
+ data_tpch "1"
+ ;;
+ sort_tpch)
+ # same data as for tpch
+ data_tpch "1"
+ ;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
usage
@@ -211,7 +236,11 @@ main() {
run_clickbench_1
run_clickbench_partitioned
run_clickbench_extended
+ run_h2o "SMALL" "PARQUET" "groupby"
+ run_h2o "MEDIUM" "PARQUET" "groupby"
+ run_h2o "BIG" "PARQUET" "groupby"
run_imdb
+ run_external_aggr
;;
tpch)
run_tpch "1"
@@ -243,6 +272,21 @@ main() {
imdb)
run_imdb
;;
+ h2o_small)
+ run_h2o "SMALL" "CSV" "groupby"
+ ;;
+ h2o_medium)
+ run_h2o "MEDIUM" "CSV" "groupby"
+ ;;
+ h2o_big)
+ run_h2o "BIG" "CSV" "groupby"
+ ;;
+ external_aggr)
+ run_external_aggr
+ ;;
+ sort_tpch)
+ run_sort_tpch
+ ;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for run"
usage
@@ -357,7 +401,7 @@ run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
- $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+ $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
}
# Runs the sort benchmark
@@ -365,7 +409,7 @@ run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
- $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
+ $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
}
@@ -439,11 +483,11 @@ run_clickbench_extended() {
}
# Downloads the csv.gz files IMDB datasets from Peter Boncz's homepage(one of the JOB paper authors)
-# http://homepages.cwi.nl/~boncz/job/imdb.tgz
+# https://event.cwi.nl/da/job/imdb.tgz
data_imdb() {
local imdb_dir="${DATA_DIR}/imdb"
local imdb_temp_gz="${imdb_dir}/imdb.tgz"
- local imdb_url="https://homepages.cwi.nl/~boncz/job/imdb.tgz"
+ local imdb_url="https://event.cwi.nl/da/job/imdb.tgz"
# imdb has 21 files, we just separate them into 3 groups for better readability
local first_required_files=(
@@ -524,7 +568,150 @@ run_imdb() {
$CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
}
+data_h2o() {
+ # Default values for size and data format
+ SIZE=${1:-"SMALL"}
+ DATA_FORMAT=${2:-"CSV"}
+
+ # Function to compare Python versions
+ version_ge() {
+ [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
+ }
+
+ export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
+
+ # Find the highest available Python version (3.10 or higher)
+ REQUIRED_VERSION="3.10"
+ PYTHON_CMD=$(command -v python3 || true)
+
+ if [ -n "$PYTHON_CMD" ]; then
+ PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+ if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
+ echo "Found Python version $PYTHON_VERSION, which is suitable."
+ else
+ echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
+ PYTHON_CMD=""
+ fi
+ fi
+
+ # Search for suitable Python versions if the default is unsuitable
+ if [ -z "$PYTHON_CMD" ]; then
+ # Loop through all available Python3 commands on the system
+ for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
+ if command -v "$CMD" &> /dev/null; then
+ PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+ if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
+ PYTHON_CMD="$CMD"
+ echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
+ break
+ fi
+ fi
+ done
+ fi
+
+ # If no suitable Python version found, exit with an error
+ if [ -z "$PYTHON_CMD" ]; then
+ echo "Python 3.10 or higher is required. Please install it."
+ return 1
+ fi
+
+ echo "Using Python command: $PYTHON_CMD"
+
+ # Install falsa and other dependencies
+ echo "Installing falsa..."
+
+ # Set virtual environment directory
+ VIRTUAL_ENV="${PWD}/venv"
+
+ # Create a virtual environment using the detected Python command
+ $PYTHON_CMD -m venv "$VIRTUAL_ENV"
+
+ # Activate the virtual environment and install dependencies
+ source "$VIRTUAL_ENV/bin/activate"
+
+ # Ensure 'falsa' is installed (avoid unnecessary reinstall)
+ pip install --quiet --upgrade falsa
+
+ # Create directory if it doesn't exist
+ H2O_DIR="${DATA_DIR}/h2o"
+ mkdir -p "${H2O_DIR}"
+
+ # Generate h2o test data
+ echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
+ falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
+
+ # Deactivate virtual environment after completion
+ deactivate
+}
+
+## TODO: only groupby is supported for now; once https://github.com/mrpowers-io/falsa/issues/21 is done, we can add support for join
+run_h2o() {
+ # Default values for size and data format
+ SIZE=${1:-"SMALL"}
+ DATA_FORMAT=${2:-"CSV"}
+ DATA_FORMAT=$(echo "$DATA_FORMAT" | tr '[:upper:]' '[:lower:]')
+ RUN_Type=${3:-"groupby"}
+
+ # Data directory and results file path
+ H2O_DIR="${DATA_DIR}/h2o"
+ RESULTS_FILE="${RESULTS_DIR}/h2o.json"
+
+ echo "RESULTS_FILE: ${RESULTS_FILE}"
+ echo "Running h2o benchmark..."
+ # Set the file name based on the size
+ case "$SIZE" in
+ "SMALL")
+ FILE_NAME="G1_1e7_1e7_100_0.${DATA_FORMAT}" # For small dataset
+ ;;
+ "MEDIUM")
+ FILE_NAME="G1_1e8_1e8_100_0.${DATA_FORMAT}" # For medium dataset
+ ;;
+ "BIG")
+ FILE_NAME="G1_1e9_1e9_100_0.${DATA_FORMAT}" # For big dataset
+ ;;
+ *)
+ echo "Invalid size. Valid options are SMALL, MEDIUM, or BIG."
+ return 1
+ ;;
+ esac
+
+ # Set the query file name based on the RUN_Type
+ QUERY_FILE="${SCRIPT_DIR}/queries/h2o/${RUN_Type}.sql"
+
+ # Run the benchmark using the dynamically constructed file path and query file
+ $CARGO_COMMAND --bin dfbench -- h2o \
+ --iterations 3 \
+ --path "${H2O_DIR}/${FILE_NAME}" \
+ --queries-path "${QUERY_FILE}" \
+ -o "${RESULTS_FILE}"
+}
+
+# Runs the external aggregation benchmark
+run_external_aggr() {
+ # Use TPC-H SF1 dataset
+ TPCH_DIR="${DATA_DIR}/tpch_sf1"
+ RESULTS_FILE="${RESULTS_DIR}/external_aggr.json"
+ echo "RESULTS_FILE: ${RESULTS_FILE}"
+ echo "Running external aggregation benchmark..."
+
+ # Only parquet is supported.
+ # Since per-operator memory limit is calculated as (total-memory-limit /
+ # number-of-partitions), and by default `--partitions` is set to number of
+ # CPU cores, we set a constant number of partitions to prevent this
+ # benchmark from failing on some machines.
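+ # (e.g. with a 64M total memory limit and 4 partitions, each operator gets roughly 16M)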
+ $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+}
+
+# Runs the sort integration benchmark
+run_sort_tpch() {
+ TPCH_DIR="${DATA_DIR}/tpch_sf1"
+ RESULTS_FILE="${RESULTS_DIR}/sort_tpch.json"
+ echo "RESULTS_FILE: ${RESULTS_FILE}"
+ echo "Running sort tpch benchmark..."
+
+ $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+}
compare_benchmarks() {
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
index 2574c0735ca8d..4b609c744d503 100755
--- a/benchmarks/compare.py
+++ b/benchmarks/compare.py
@@ -1,21 +1,20 @@
#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
from __future__ import annotations
diff --git a/benchmarks/queries/h2o/groupby.sql b/benchmarks/queries/h2o/groupby.sql
new file mode 100644
index 0000000000000..c2101ef8ada2d
--- /dev/null
+++ b/benchmarks/queries/h2o/groupby.sql
@@ -0,0 +1,10 @@
+SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1;
+SELECT id1, id2, SUM(v1) AS v1 FROM x GROUP BY id1, id2;
+SELECT id3, SUM(v1) AS v1, AVG(v3) AS v3 FROM x GROUP BY id3;
+SELECT id4, AVG(v1) AS v1, AVG(v2) AS v2, AVG(v3) AS v3 FROM x GROUP BY id4;
+SELECT id6, SUM(v1) AS v1, SUM(v2) AS v2, SUM(v3) AS v3 FROM x GROUP BY id6;
+SELECT id4, id5, MEDIAN(v3) AS median_v3, STDDEV(v3) AS sd_v3 FROM x GROUP BY id4, id5;
+SELECT id3, MAX(v1) - MIN(v2) AS range_v1_v2 FROM x GROUP BY id3;
+SELECT id6, largest2_v3 FROM (SELECT id6, v3 AS largest2_v3, ROW_NUMBER() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS order_v3 FROM x WHERE v3 IS NOT NULL) sub_query WHERE order_v3 <= 2;
+SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM x GROUP BY id2, id4;
+SELECT id1, id2, id3, id4, id5, id6, SUM(v3) AS v3, COUNT(*) AS count FROM x GROUP BY id1, id2, id3, id4, id5, id6;
diff --git a/benchmarks/queries/h2o/join.sql b/benchmarks/queries/h2o/join.sql
new file mode 100644
index 0000000000000..8546b9292dbb4
--- /dev/null
+++ b/benchmarks/queries/h2o/join.sql
@@ -0,0 +1,5 @@
+SELECT x.id1, x.id2, x.id3, x.id4 as xid4, small.id4 as smallid4, x.id5, x.id6, x.v1, small.v2 FROM x INNER JOIN small ON x.id1 = small.id1;
+SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x INNER JOIN medium ON x.id2 = medium.id2;
+SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x LEFT JOIN medium ON x.id2 = medium.id2;
+SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x JOIN medium ON x.id5 = medium.id5;
+SELECT x.id1 as xid1, large.id1 as largeid1, x.id2 as xid2, large.id2 as largeid2, x.id3, x.id4 as xid4, large.id4 as largeid4, x.id5 as xid5, large.id5 as largeid5, x.id6 as xid6, large.id6 as largeid6, x.v1, large.v2 FROM x JOIN large ON x.id3 = large.id3;
diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs
index f7b84116e793a..db6c29f4a46a6 100644
--- a/benchmarks/src/bin/dfbench.rs
+++ b/benchmarks/src/bin/dfbench.rs
@@ -33,7 +33,9 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch};
+use datafusion_benchmarks::{
+ clickbench, h2o, imdb, parquet_filter, sort, sort_tpch, tpch,
+};
#[derive(Debug, StructOpt)]
#[structopt(about = "benchmark command")]
@@ -43,7 +45,9 @@ enum Options {
Clickbench(clickbench::RunOpt),
ParquetFilter(parquet_filter::RunOpt),
Sort(sort::RunOpt),
+ SortTpch(sort_tpch::RunOpt),
Imdb(imdb::RunOpt),
+ H2o(h2o::RunOpt),
}
// Main benchmark runner entrypoint
@@ -57,6 +61,8 @@ pub async fn main() -> Result<()> {
Options::Clickbench(opt) => opt.run().await,
Options::ParquetFilter(opt) => opt.run().await,
Options::Sort(opt) => opt.run().await,
+ Options::SortTpch(opt) => opt.run().await,
Options::Imdb(opt) => opt.run().await,
+ Options::H2o(opt) => opt.run().await,
}
}
diff --git a/benchmarks/src/bin/external_aggr.rs b/benchmarks/src/bin/external_aggr.rs
new file mode 100644
index 0000000000000..a2fb75dd19418
--- /dev/null
+++ b/benchmarks/src/bin/external_aggr.rs
@@ -0,0 +1,389 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! external_aggr binary entrypoint
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::LazyLock;
+use structopt::StructOpt;
+
+use arrow::record_batch::RecordBatch;
+use arrow::util::pretty;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::{
+ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
+};
+use datafusion::datasource::{MemTable, TableProvider};
+use datafusion::error::Result;
+use datafusion::execution::memory_pool::FairSpillPool;
+use datafusion::execution::memory_pool::{human_readable_size, units};
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
+use datafusion::execution::SessionStateBuilder;
+use datafusion::physical_plan::display::DisplayableExecutionPlan;
+use datafusion::physical_plan::{collect, displayable};
+use datafusion::prelude::*;
+use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt};
+use datafusion_common::instant::Instant;
+use datafusion_common::utils::get_available_parallelism;
+use datafusion_common::{exec_datafusion_err, exec_err, DEFAULT_PARQUET_EXTENSION};
+
+#[derive(Debug, StructOpt)]
+#[structopt(
+ name = "datafusion-external-aggregation",
+ about = "DataFusion external aggregation benchmark"
+)]
+enum ExternalAggrOpt {
+ Benchmark(ExternalAggrConfig),
+}
+
+#[derive(Debug, StructOpt)]
+struct ExternalAggrConfig {
+ /// Query number. If not specified, runs all queries
+ #[structopt(short, long)]
+ query: Option<usize>,
+
+ /// Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for the given query.
+ #[structopt(long)]
+ memory_limit: Option<String>,
+
+ /// Common options
+ #[structopt(flatten)]
+ common: CommonOpt,
+
+ /// Path to data files (lineitem). Only parquet format is supported
+ #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+ path: PathBuf,
+
+ /// Load the data into a MemTable before executing the query
+ #[structopt(short = "m", long = "mem-table")]
+ mem_table: bool,
+
+ /// Path to JSON benchmark result to be compared using `compare.py`
+ #[structopt(parse(from_os_str), short = "o", long = "output")]
+ output_path: Option<PathBuf>,
+}
+
+struct QueryResult {
+ elapsed: std::time::Duration,
+ row_count: usize,
+}
+
+/// Query Memory Limits
+/// Map query id to predefined memory limits
+///
+/// Q1 requires 36MiB for aggregation
+/// Memory limits to run: 64MiB, 32MiB, 16MiB
+/// Q2 requires 250MiB for aggregation
+/// Memory limits to run: 512MiB, 256MiB, 128MiB, 64MiB, 32MiB
+static QUERY_MEMORY_LIMITS: LazyLock<HashMap<usize, Vec<u64>>> = LazyLock::new(|| {
+ use units::*;
+ let mut map = HashMap::new();
+ map.insert(1, vec![64 * MB, 32 * MB, 16 * MB]);
+ map.insert(2, vec![512 * MB, 256 * MB, 128 * MB, 64 * MB, 32 * MB]);
+ map
+});
+
+impl ExternalAggrConfig {
+ const AGGR_TABLES: [&'static str; 1] = ["lineitem"];
+ const AGGR_QUERIES: [&'static str; 2] = [
+ // Q1: Output size is ~25% of lineitem table
+ r#"
+ SELECT count(*)
+ FROM (
+ SELECT DISTINCT l_orderkey
+ FROM lineitem
+ )
+ "#,
+ // Q2: Output size is ~99% of lineitem table
+ r#"
+ SELECT count(*)
+ FROM (
+ SELECT DISTINCT l_orderkey, l_suppkey
+ FROM lineitem
+ )
+ "#,
+ ];
+
+ /// If `--query` and `--memory-limit` are not specified, run all queries
+ /// with pre-configured memory limits
+ /// If only `--query` is specified, run the query with all memory limits
+ /// for this query
+ /// If both `--query` and `--memory-limit` are specified, run the query
+ /// with the specified memory limit
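+ ///
+ /// Example (the path below is illustrative):
+ /// `external_aggr benchmark --query 1 --memory-limit 32M --path ./data/tpch_sf1`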
+ pub async fn run(&self) -> Result<()> {
+ let mut benchmark_run = BenchmarkRun::new();
+
+ let memory_limit = match &self.memory_limit {
+ Some(limit) => Some(Self::parse_memory_limit(limit)?),
+ None => None,
+ };
+
+ let query_range = match self.query {
+ Some(query_id) => query_id..=query_id,
+ None => 1..=Self::AGGR_QUERIES.len(),
+ };
+
+ // Each element is (query_id, memory_limit)
+ // e.g. [(1, 64_000), (1, 32_000)...] means first run Q1 with a 64 kB memory
+ // limit, then run Q1 with a 32 kB memory limit, etc.
+ let mut query_executions = vec![];
+ // Setup `query_executions`
+ for query_id in query_range {
+ if query_id > Self::AGGR_QUERIES.len() {
+ return exec_err!(
+ "Invalid '--query'(query number) {} for external aggregation benchmark.",
+ query_id
+ );
+ }
+
+ match memory_limit {
+ Some(limit) => {
+ query_executions.push((query_id, limit));
+ }
+ None => {
+ let memory_limits = QUERY_MEMORY_LIMITS.get(&query_id).unwrap();
+ for limit in memory_limits {
+ query_executions.push((query_id, *limit));
+ }
+ }
+ }
+ }
+
+ for (query_id, mem_limit) in query_executions {
+ benchmark_run.start_new_case(&format!(
+ "{query_id}({})",
+ human_readable_size(mem_limit as usize)
+ ));
+
+ let query_results = self.benchmark_query(query_id, mem_limit).await?;
+ for iter in query_results {
+ benchmark_run.write_iter(iter.elapsed, iter.row_count);
+ }
+ }
+
+ benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+
+ Ok(())
+ }
+
+ /// Benchmark query `query_id` in `AGGR_QUERIES`
+ async fn benchmark_query(
+ &self,
+ query_id: usize,
+ mem_limit: u64,
+ ) -> Result<Vec<QueryResult>> {
+ let query_name =
+ format!("Q{query_id}({})", human_readable_size(mem_limit as usize));
+ let config = self.common.config();
+ let runtime_env = RuntimeEnvBuilder::new()
+ .with_memory_pool(Arc::new(FairSpillPool::new(mem_limit as usize)))
+ .build_arc()?;
+ let state = SessionStateBuilder::new()
+ .with_config(config)
+ .with_runtime_env(runtime_env)
+ .with_default_features()
+ .build();
+ let ctx = SessionContext::from(state);
+
+ // register tables
+ self.register_tables(&ctx).await?;
+
+ let mut millis = vec![];
+ // run benchmark
+ let mut query_results = vec![];
+ for i in 0..self.iterations() {
+ let start = Instant::now();
+
+ let query_idx = query_id - 1; // 1-indexed -> 0-indexed
+ let sql = Self::AGGR_QUERIES[query_idx];
+
+ let result = self.execute_query(&ctx, sql).await?;
+
+ let elapsed = start.elapsed(); //.as_secs_f64() * 1000.0;
+ let ms = elapsed.as_secs_f64() * 1000.0;
+ millis.push(ms);
+
+ let row_count = result.iter().map(|b| b.num_rows()).sum();
+ println!(
+ "{query_name} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+ );
+ query_results.push(QueryResult { elapsed, row_count });
+ }
+
+ let avg = millis.iter().sum::<f64>() / millis.len() as f64;
+ println!("{query_name} avg time: {avg:.2} ms");
+
+ Ok(query_results)
+ }
+
+ async fn register_tables(&self, ctx: &SessionContext) -> Result<()> {
+ for table in Self::AGGR_TABLES {
+ let table_provider = { self.get_table(ctx, table).await? };
+
+ if self.mem_table {
+ println!("Loading table '{table}' into memory");
+ let start = Instant::now();
+ let memtable =
+ MemTable::load(table_provider, Some(self.partitions()), &ctx.state())
+ .await?;
+ println!(
+ "Loaded table '{}' into memory in {} ms",
+ table,
+ start.elapsed().as_millis()
+ );
+ ctx.register_table(table, Arc::new(memtable))?;
+ } else {
+ ctx.register_table(table, table_provider)?;
+ }
+ }
+ Ok(())
+ }
+
+ async fn execute_query(
+ &self,
+ ctx: &SessionContext,
+ sql: &str,
+ ) -> Result<Vec<RecordBatch>> {
+ let debug = self.common.debug;
+ let plan = ctx.sql(sql).await?;
+ let (state, plan) = plan.into_parts();
+
+ if debug {
+ println!("=== Logical plan ===\n{plan}\n");
+ }
+
+ let plan = state.optimize(&plan)?;
+ if debug {
+ println!("=== Optimized logical plan ===\n{plan}\n");
+ }
+ let physical_plan = state.create_physical_plan(&plan).await?;
+ if debug {
+ println!(
+ "=== Physical plan ===\n{}\n",
+ displayable(physical_plan.as_ref()).indent(true)
+ );
+ }
+ let result = collect(physical_plan.clone(), state.task_ctx()).await?;
+ if debug {
+ println!(
+ "=== Physical plan with metrics ===\n{}\n",
+ DisplayableExecutionPlan::with_metrics(physical_plan.as_ref())
+ .indent(true)
+ );
+ if !result.is_empty() {
+ // do not call print_batches if there are no batches as the result is confusing
+ // and makes it look like there is a batch with no columns
+ pretty::print_batches(&result)?;
+ }
+ }
+ Ok(result)
+ }
+
+ async fn get_table(
+ &self,
+ ctx: &SessionContext,
+ table: &str,
+ ) -> Result<Arc<dyn TableProvider>> {
+ let path = self.path.to_str().unwrap();
+
+ // Obtain a snapshot of the SessionState
+ let state = ctx.state();
+ let path = format!("{path}/{table}");
+ let format = Arc::new(
+ ParquetFormat::default()
+ .with_options(ctx.state().table_options().parquet.clone()),
+ );
+ let extension = DEFAULT_PARQUET_EXTENSION;
+
+ let options = ListingOptions::new(format)
+ .with_file_extension(extension)
+ .with_collect_stat(state.config().collect_statistics());
+
+ let table_path = ListingTableUrl::parse(path)?;
+ let config = ListingTableConfig::new(table_path).with_listing_options(options);
+ let config = config.infer_schema(&state).await?;
+
+ Ok(Arc::new(ListingTable::try_new(config)?))
+ }
+
+ fn iterations(&self) -> usize {
+ self.common.iterations
+ }
+
+ fn partitions(&self) -> usize {
+ self.common
+ .partitions
+ .unwrap_or(get_available_parallelism())
+ }
+
+ /// Parse memory limit from string to number of bytes
+ /// e.g. '100M' -> 104857600, '1.5G' -> 1610612736
+ fn parse_memory_limit(limit: &str) -> Result<u64> {
+ let (number, unit) = limit.split_at(limit.len() - 1);
+ let number: f64 = number.parse().map_err(|_| {
+ exec_datafusion_err!("Failed to parse number from memory limit '{}'", limit)
+ })?;
+
+ match unit {
+ "K" => Ok((number * 1024.0) as u64),
+ "M" => Ok((number * 1024.0 * 1024.0) as u64),
+ "G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as u64),
+ _ => exec_err!("Unsupported unit '{}' in memory limit '{}'", unit, limit),
+ }
+ }
+}
+
+#[tokio::main]
+pub async fn main() -> Result<()> {
+ env_logger::init();
+
+ match ExternalAggrOpt::from_args() {
+ ExternalAggrOpt::Benchmark(opt) => opt.run().await?,
+ }
+
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_parse_memory_limit_all() {
+ // Test valid inputs
+ assert_eq!(
+ ExternalAggrConfig::parse_memory_limit("100K").unwrap(),
+ 102400
+ );
+ assert_eq!(
+ ExternalAggrConfig::parse_memory_limit("1.5M").unwrap(),
+ 1572864
+ );
+ assert_eq!(
+ ExternalAggrConfig::parse_memory_limit("2G").unwrap(),
+ 2147483648
+ );
+
+ // Test invalid unit
+ assert!(ExternalAggrConfig::parse_memory_limit("500X").is_err());
+
+ // Test invalid number
+ assert!(ExternalAggrConfig::parse_memory_limit("abcM").is_err());
+ }
+}
diff --git a/benchmarks/src/bin/h2o.rs b/benchmarks/src/bin/h2o.rs
deleted file mode 100644
index 1bb8cb9d43e4b..0000000000000
--- a/benchmarks/src/bin/h2o.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! DataFusion h2o benchmarks
-
-use datafusion::arrow::datatypes::{DataType, Field, Schema};
-use datafusion::config::ConfigOptions;
-use datafusion::datasource::file_format::csv::CsvFormat;
-use datafusion::datasource::listing::{
- ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
-};
-use datafusion::datasource::MemTable;
-use datafusion::prelude::CsvReadOptions;
-use datafusion::{arrow::util::pretty, error::Result, prelude::SessionContext};
-use datafusion_benchmarks::BenchmarkRun;
-use std::path::PathBuf;
-use std::sync::Arc;
-use structopt::StructOpt;
-use tokio::time::Instant;
-
-#[derive(Debug, StructOpt)]
-#[structopt(name = "datafusion-h2o", about = "DataFusion h2o benchmarks")]
-enum Opt {
- GroupBy(GroupBy), //TODO add Join queries
-}
-
-#[derive(Debug, StructOpt)]
-struct GroupBy {
- /// Query number
- #[structopt(short, long)]
- query: usize,
- /// Path to data file
- #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
- path: PathBuf,
- /// Activate debug mode to see query results
- #[structopt(short, long)]
- debug: bool,
- /// Load the data into a MemTable before executing the query
- #[structopt(short = "m", long = "mem-table")]
- mem_table: bool,
- /// Path to machine readable output file
- #[structopt(parse(from_os_str), short = "o", long = "output")]
- output_path: Option<PathBuf>,
-}
-
-#[tokio::main]
-async fn main() -> Result<()> {
- let opt = Opt::from_args();
- println!("Running benchmarks with the following options: {opt:?}");
- match opt {
- Opt::GroupBy(config) => group_by(&config).await,
- }
-}
-
-async fn group_by(opt: &GroupBy) -> Result<()> {
- let mut rundata = BenchmarkRun::new();
- let path = opt.path.to_str().unwrap();
- let mut config = ConfigOptions::from_env()?;
- config.execution.batch_size = 65535;
-
- let ctx = SessionContext::new_with_config(config.into());
-
- let schema = Schema::new(vec![
- Field::new("id1", DataType::Utf8, false),
- Field::new("id2", DataType::Utf8, false),
- Field::new("id3", DataType::Utf8, false),
- Field::new("id4", DataType::Int32, false),
- Field::new("id5", DataType::Int32, false),
- Field::new("id6", DataType::Int32, false),
- Field::new("v1", DataType::Int32, false),
- Field::new("v2", DataType::Int32, false),
- Field::new("v3", DataType::Float64, false),
- ]);
-
- if opt.mem_table {
- let listing_config = ListingTableConfig::new(ListingTableUrl::parse(path)?)
- .with_listing_options(ListingOptions::new(Arc::new(CsvFormat::default())))
- .with_schema(Arc::new(schema));
- let csv = ListingTable::try_new(listing_config)?;
- let partition_size = num_cpus::get();
- let memtable =
- MemTable::load(Arc::new(csv), Some(partition_size), &ctx.state()).await?;
- ctx.register_table("x", Arc::new(memtable))?;
- } else {
- ctx.register_csv("x", path, CsvReadOptions::default().schema(&schema))
- .await?;
- }
- rundata.start_new_case(&opt.query.to_string());
- let sql = match opt.query {
- 1 => "select id1, sum(v1) as v1 from x group by id1",
- 2 => "select id1, id2, sum(v1) as v1 from x group by id1, id2",
- 3 => "select id3, sum(v1) as v1, mean(v3) as v3 from x group by id3",
- 4 => "select id4, mean(v1) as v1, mean(v2) as v2, mean(v3) as v3 from x group by id4",
- 5 => "select id6, sum(v1) as v1, sum(v2) as v2, sum(v3) as v3 from x group by id6",
- 6 => "select id4, id5, median(v3) as median_v3, stddev(v3) as sd_v3 from x group by id4, id5",
- 7 => "select id3, max(v1)-min(v2) as range_v1_v2 from x group by id3",
- 8 => "select id6, largest2_v3 from (select id6, v3 as largest2_v3, row_number() over (partition by id6 order by v3 desc) as order_v3 from x where v3 is not null) sub_query where order_v3 <= 2",
- 9 => "select id2, id4, pow(corr(v1, v2), 2) as r2 from x group by id2, id4",
- 10 => "select id1, id2, id3, id4, id5, id6, sum(v3) as v3, count(*) as count from x group by id1, id2, id3, id4, id5, id6",
- _ => unimplemented!(),
- };
-
- println!("Executing {sql}");
- let start = Instant::now();
- let df = ctx.sql(sql).await?;
- let batches = df.collect().await?;
- let elapsed = start.elapsed();
- let numrows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
- if opt.debug {
- pretty::print_batches(&batches)?;
- }
- rundata.write_iter(elapsed, numrows);
- println!(
- "h2o groupby query {} took {} ms",
- opt.query,
- elapsed.as_secs_f64() * 1000.0
- );
- rundata.maybe_write_json(opt.output_path.as_ref())?;
- Ok(())
-}
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
index 207da4020b588..6b7c75ed4babc 100644
--- a/benchmarks/src/clickbench.rs
+++ b/benchmarks/src/clickbench.rs
@@ -18,6 +18,7 @@
use std::path::Path;
use std::path::PathBuf;
+use crate::util::{BenchmarkRun, CommonOpt};
use datafusion::{
error::{DataFusionError, Result},
prelude::SessionContext,
@@ -26,8 +27,6 @@ use datafusion_common::exec_datafusion_err;
use datafusion_common::instant::Instant;
use structopt::StructOpt;
-use crate::{BenchmarkRun, CommonOpt};
-
/// Run the clickbench benchmark
///
/// The ClickBench[1] benchmarks are widely cited in the industry and
@@ -116,12 +115,14 @@ impl RunOpt {
None => queries.min_query_id()..=queries.max_query_id(),
};
+ // configure parquet options
let mut config = self.common.config();
- config
- .options_mut()
- .execution
- .parquet
- .schema_force_view_types = self.common.force_view_types;
+ {
+ let parquet_options = &mut config.options_mut().execution.parquet;
+ // The hits_partitioned dataset specifies string columns
+ // as binary due to how it was written. Force it to strings
+ parquet_options.binary_as_string = true;
+ }
let ctx = SessionContext::new_with_config(config);
self.register_hits(&ctx).await?;
@@ -144,12 +145,15 @@ impl RunOpt {
);
benchmark_run.write_iter(elapsed, row_count);
}
+ if self.common.debug {
+ ctx.sql(sql).await?.explain(false, false)?.show().await?;
+ }
}
benchmark_run.maybe_write_json(self.output_path.as_ref())?;
Ok(())
}
- /// Registrs the `hits.parquet` as a table named `hits`
+ /// Registers the `hits.parquet` as a table named `hits`
async fn register_hits(&self, ctx: &SessionContext) -> Result<()> {
let options = Default::default();
let path = self.path.as_os_str().to_str().unwrap();
diff --git a/benchmarks/src/h2o.rs b/benchmarks/src/h2o.rs
new file mode 100644
index 0000000000000..53a516ceb56d4
--- /dev/null
+++ b/benchmarks/src/h2o.rs
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt};
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::{exec_datafusion_err, instant::Instant, DataFusionError};
+use std::path::{Path, PathBuf};
+use structopt::StructOpt;
+
+/// Run the H2O benchmark
+#[derive(Debug, StructOpt, Clone)]
+#[structopt(verbatim_doc_comment)]
+pub struct RunOpt {
+ #[structopt(short, long)]
+ query: Option<usize>,
+
+ /// Common options
+ #[structopt(flatten)]
+ common: CommonOpt,
+
+ /// Path to queries.sql (single file)
+ /// default value is the groupby.sql file in the h2o benchmark
+ #[structopt(
+ parse(from_os_str),
+ short = "r",
+ long = "queries-path",
+ default_value = "benchmarks/queries/h2o/groupby.sql"
+ )]
+ queries_path: PathBuf,
+
+ /// Path to data file (parquet or csv)
+ /// Default value is the G1_1e7_1e7_100_0.csv file in the h2o benchmark
+ /// This is the small csv file with 10^7 rows
+ #[structopt(
+ parse(from_os_str),
+ short = "p",
+ long = "path",
+ default_value = "benchmarks/data/h2o/G1_1e7_1e7_100_0.csv"
+ )]
+ path: PathBuf,
+
+ /// If present, write results json here
+ #[structopt(parse(from_os_str), short = "o", long = "output")]
+ output_path: Option<PathBuf>,
+}
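+
+// Example invocation (the paths shown are the defaults above):
+// `dfbench h2o --path benchmarks/data/h2o/G1_1e7_1e7_100_0.csv \
+//     --queries-path benchmarks/queries/h2o/groupby.sql --query 1`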
+
+impl RunOpt {
+ pub async fn run(self) -> Result<()> {
+ println!("Running benchmarks with the following options: {self:?}");
+ let queries = AllQueries::try_new(&self.queries_path)?;
+ let query_range = match self.query {
+ Some(query_id) => query_id..=query_id,
+ None => queries.min_query_id()..=queries.max_query_id(),
+ };
+
+ let config = self.common.config();
+ let ctx = SessionContext::new_with_config(config);
+
+ // Register data
+ self.register_data(&ctx).await?;
+
+ let iterations = self.common.iterations;
+ let mut benchmark_run = BenchmarkRun::new();
+ for query_id in query_range {
+ benchmark_run.start_new_case(&format!("Query {query_id}"));
+ let sql = queries.get_query(query_id)?;
+ println!("Q{query_id}: {sql}");
+
+ for i in 1..=iterations {
+ let start = Instant::now();
+ let results = ctx.sql(sql).await?.collect().await?;
+ let elapsed = start.elapsed();
+ let ms = elapsed.as_secs_f64() * 1000.0;
+ let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
+ println!(
+ "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+ );
+ benchmark_run.write_iter(elapsed, row_count);
+ }
+ if self.common.debug {
+ ctx.sql(sql).await?.explain(false, false)?.show().await?;
+ }
+ benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+ }
+
+ Ok(())
+ }
+
+ async fn register_data(&self, ctx: &SessionContext) -> Result<()> {
+ let csv_options = Default::default();
+ let parquet_options = Default::default();
+ let path = self.path.as_os_str().to_str().unwrap();
+
+ if self.path.extension().map(|s| s == "csv").unwrap_or(false) {
+ ctx.register_csv("x", path, csv_options)
+ .await
+ .map_err(|e| {
+ DataFusionError::Context(
+ format!("Registering 'table' as {path}"),
+ Box::new(e),
+ )
+ })
+ .expect("error registering csv");
+ }
+
+ if self
+ .path
+ .extension()
+ .map(|s| s == "parquet")
+ .unwrap_or(false)
+ {
+ ctx.register_parquet("x", path, parquet_options)
+ .await
+ .map_err(|e| {
+ DataFusionError::Context(
+ format!("Registering 'table' as {path}"),
+ Box::new(e),
+ )
+ })
+ .expect("error registering parquet");
+ }
+ Ok(())
+ }
+}
+
+struct AllQueries {
+ queries: Vec<String>,
+}
+
+impl AllQueries {
+ fn try_new(path: &Path) -> Result<Self> {
+ let all_queries = std::fs::read_to_string(path)
+ .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
+
+ Ok(Self {
+ queries: all_queries.lines().map(|s| s.to_string()).collect(),
+ })
+ }
+
+ /// Returns the text of query `query_id`
+ fn get_query(&self, query_id: usize) -> Result<&str> {
+ self.queries
+ .get(query_id - 1)
+ .ok_or_else(|| {
+ let min_id = self.min_query_id();
+ let max_id = self.max_query_id();
+ exec_datafusion_err!(
+ "Invalid query id {query_id}. Must be between {min_id} and {max_id}"
+ )
+ })
+ .map(|s| s.as_str())
+ }
+
+ fn min_query_id(&self) -> usize {
+ 1
+ }
+
+ fn max_query_id(&self) -> usize {
+ self.queries.len()
+ }
+}
diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs
index 697c79dc894a4..8d2317c62ef11 100644
--- a/benchmarks/src/imdb/run.rs
+++ b/benchmarks/src/imdb/run.rs
@@ -19,7 +19,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES};
-use crate::{BenchmarkRun, CommonOpt};
+use crate::util::{BenchmarkRun, CommonOpt};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::{self, pretty_format_batches};
@@ -35,6 +35,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
use datafusion::physical_plan::{collect, displayable};
use datafusion::prelude::*;
use datafusion_common::instant::Instant;
+use datafusion_common::utils::get_available_parallelism;
use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
use log::info;
@@ -305,11 +306,7 @@ impl RunOpt {
.config()
.with_collect_statistics(!self.disable_statistics);
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
- config
- .options_mut()
- .execution
- .parquet
- .schema_force_view_types = self.common.force_view_types;
+
let ctx = SessionContext::new_with_config(config);
// register tables
@@ -472,7 +469,9 @@ impl RunOpt {
}
fn partitions(&self) -> usize {
- self.common.partitions.unwrap_or(num_cpus::get())
+ self.common
+ .partitions
+ .unwrap_or(get_available_parallelism())
}
}
@@ -489,6 +488,7 @@ mod tests {
use super::*;
+ use crate::util::CommonOpt;
use datafusion::common::exec_err;
use datafusion::error::Result;
use datafusion_proto::bytes::{
@@ -516,7 +516,6 @@ mod tests {
partitions: Some(2),
batch_size: 8192,
debug: false,
- force_view_types: false,
};
let opt = RunOpt {
query: Some(query),
@@ -550,7 +549,6 @@ mod tests {
partitions: Some(2),
batch_size: 8192,
debug: false,
- force_view_types: false,
};
let opt = RunOpt {
query: Some(query),
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
index 52d81ca91816a..858a5b9df7f86 100644
--- a/benchmarks/src/lib.rs
+++ b/benchmarks/src/lib.rs
@@ -17,9 +17,10 @@
//! DataFusion benchmark runner
pub mod clickbench;
+pub mod h2o;
pub mod imdb;
pub mod parquet_filter;
pub mod sort;
+pub mod sort_tpch;
pub mod tpch;
-mod util;
-pub use util::*;
+pub mod util;
diff --git a/benchmarks/src/parquet_filter.rs b/benchmarks/src/parquet_filter.rs
index 5c98a2f8be3de..34103af0ffd21 100644
--- a/benchmarks/src/parquet_filter.rs
+++ b/benchmarks/src/parquet_filter.rs
@@ -17,7 +17,7 @@
use std::path::PathBuf;
-use crate::{AccessLogOpt, BenchmarkRun, CommonOpt};
+use crate::util::{AccessLogOpt, BenchmarkRun, CommonOpt};
use arrow::util::pretty;
use datafusion::common::Result;
diff --git a/benchmarks/src/sort.rs b/benchmarks/src/sort.rs
index 19eec2949ef61..9cf09c57205a7 100644
--- a/benchmarks/src/sort.rs
+++ b/benchmarks/src/sort.rs
@@ -18,17 +18,17 @@
use std::path::PathBuf;
use std::sync::Arc;
-use crate::{AccessLogOpt, BenchmarkRun, CommonOpt};
+use crate::util::{AccessLogOpt, BenchmarkRun, CommonOpt};
use arrow::util::pretty;
use datafusion::common::Result;
-use datafusion::physical_expr::PhysicalSortExpr;
+use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr};
use datafusion::physical_plan::collect;
use datafusion::physical_plan::sorts::sort::SortExec;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion::test_util::parquet::TestParquetFile;
use datafusion_common::instant::Instant;
-
+use datafusion_common::utils::get_available_parallelism;
use structopt::StructOpt;
/// Test performance of sorting large datasets
@@ -70,31 +70,28 @@ impl RunOpt {
let sort_cases = vec![
(
"sort utf8",
- vec![PhysicalSortExpr {
+ LexOrdering::new(vec![PhysicalSortExpr {
expr: col("request_method", &schema)?,
options: Default::default(),
- }],
+ }]),
),
(
"sort int",
- vec![PhysicalSortExpr {
- expr: col("request_bytes", &schema)?,
+ LexOrdering::new(vec![PhysicalSortExpr {
+ expr: col("response_bytes", &schema)?,
options: Default::default(),
- }],
+ }]),
),
(
"sort decimal",
- vec![
- // sort decimal
- PhysicalSortExpr {
- expr: col("decimal_price", &schema)?,
- options: Default::default(),
- },
- ],
+ LexOrdering::new(vec![PhysicalSortExpr {
+ expr: col("decimal_price", &schema)?,
+ options: Default::default(),
+ }]),
),
(
"sort integer tuple",
- vec![
+ LexOrdering::new(vec![
PhysicalSortExpr {
expr: col("request_bytes", &schema)?,
options: Default::default(),
@@ -103,11 +100,11 @@ impl RunOpt {
expr: col("response_bytes", &schema)?,
options: Default::default(),
},
- ],
+ ]),
),
(
"sort utf8 tuple",
- vec![
+ LexOrdering::new(vec![
// sort utf8 tuple
PhysicalSortExpr {
expr: col("service", &schema)?,
@@ -125,11 +122,11 @@ impl RunOpt {
expr: col("image", &schema)?,
options: Default::default(),
},
- ],
+ ]),
),
(
"sort mixed tuple",
- vec![
+ LexOrdering::new(vec![
PhysicalSortExpr {
expr: col("service", &schema)?,
options: Default::default(),
@@ -142,7 +139,7 @@ impl RunOpt {
expr: col("decimal_price", &schema)?,
options: Default::default(),
},
- ],
+ ]),
),
];
for (title, expr) in sort_cases {
@@ -150,7 +147,9 @@ impl RunOpt {
rundata.start_new_case(title);
for i in 0..self.common.iterations {
let config = SessionConfig::new().with_target_partitions(
- self.common.partitions.unwrap_or(num_cpus::get()),
+ self.common
+ .partitions
+ .unwrap_or(get_available_parallelism()),
);
let ctx = SessionContext::new_with_config(config);
let (rows, elapsed) =
@@ -170,13 +169,13 @@ impl RunOpt {
async fn exec_sort(
ctx: &SessionContext,
- expr: &[PhysicalSortExpr],
+ expr: &LexOrdering,
test_file: &TestParquetFile,
debug: bool,
) -> Result<(usize, std::time::Duration)> {
let start = Instant::now();
let scan = test_file.create_scan(ctx, None).await?;
- let exec = Arc::new(SortExec::new(expr.to_owned(), scan));
+ let exec = Arc::new(SortExec::new(expr.clone(), scan));
let task_ctx = ctx.task_ctx();
let result = collect(exec, task_ctx).await?;
let elapsed = start.elapsed();
diff --git a/benchmarks/src/sort_tpch.rs b/benchmarks/src/sort_tpch.rs
new file mode 100644
index 0000000000000..566a5ea62c2d0
--- /dev/null
+++ b/benchmarks/src/sort_tpch.rs
@@ -0,0 +1,325 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This module provides an integration benchmark for the sort operation.
+//! It runs different sort SQL queries on the TPC-H `lineitem` parquet dataset.
+//!
+//! The other `Sort` benchmark focuses on single-core execution; this benchmark
+//! runs end-to-end sort queries and tests performance on multiple CPU cores.
+
+use futures::StreamExt;
+use std::path::PathBuf;
+use std::sync::Arc;
+use structopt::StructOpt;
+
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::{
+ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
+};
+use datafusion::datasource::{MemTable, TableProvider};
+use datafusion::error::Result;
+use datafusion::execution::SessionStateBuilder;
+use datafusion::physical_plan::display::DisplayableExecutionPlan;
+use datafusion::physical_plan::{displayable, execute_stream};
+use datafusion::prelude::*;
+use datafusion_common::instant::Instant;
+use datafusion_common::utils::get_available_parallelism;
+use datafusion_common::DEFAULT_PARQUET_EXTENSION;
+
+use crate::util::{BenchmarkRun, CommonOpt};
+
+#[derive(Debug, StructOpt)]
+pub struct RunOpt {
+ /// Common options
+ #[structopt(flatten)]
+ common: CommonOpt,
+
+ /// Sort query number. If not specified, runs all queries
+ #[structopt(short, long)]
+ query: Option<usize>,
+
+ /// Path to data files (lineitem). Only parquet format is supported
+ #[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
+ path: PathBuf,
+
+ /// Path to JSON benchmark result to be compared using `compare.py`
+ #[structopt(parse(from_os_str), short = "o", long = "output")]
+ output_path: Option<PathBuf>,
+
+ /// Load the data into a MemTable before executing the query
+ #[structopt(short = "m", long = "mem-table")]
+ mem_table: bool,
+}
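+
+// Example invocation (illustrative path): `dfbench sort-tpch --path ./data/tpch_sf1 --query 3`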
+
+struct QueryResult {
+ elapsed: std::time::Duration,
+ row_count: usize,
+}
+
+impl RunOpt {
+ const SORT_TABLES: [&'static str; 1] = ["lineitem"];
+
+ /// Sort queries with different characteristics:
+ /// - Sort key with fixed length or variable length (VARCHAR)
+ /// - Sort key with different cardinality
+ /// - Different number of sort keys
+ /// - Different number of payload columns (thin: 1 additional column other
+ /// than sort keys; wide: all columns except sort keys)
+ ///
+ /// DataSet is `lineitem` table in TPCH dataset (16 columns, 6M rows for
+ /// scale factor 1.0, cardinality is counted from SF1 dataset)
+ ///
+ /// Key Columns:
+ /// - Column `l_linenumber`, type: `INTEGER`, cardinality: 7
+ /// - Column `l_suppkey`, type: `BIGINT`, cardinality: 10k
+ /// - Column `l_orderkey`, type: `BIGINT`, cardinality: 1.5M
+ /// - Column `l_comment`, type: `VARCHAR`, cardinality: 4.5M (len is ~26 chars)
+ ///
+ /// Payload Columns:
+ /// - Thin variant: `l_partkey` column with `BIGINT` type (1 column)
+ /// - Wide variant: all columns except for possible key columns (12 columns)
+ const SORT_QUERIES: [&'static str; 10] = [
+ // Q1: 1 sort key (type: INTEGER, cardinality: 7) + 1 payload column
+ r#"
+ SELECT l_linenumber, l_partkey
+ FROM lineitem
+ ORDER BY l_linenumber
+ "#,
+ // Q2: 1 sort key (type: BIGINT, cardinality: 1.5M) + 1 payload column
+ r#"
+ SELECT l_orderkey, l_partkey
+ FROM lineitem
+ ORDER BY l_orderkey
+ "#,
+ // Q3: 1 sort key (type: VARCHAR, cardinality: 4.5M) + 1 payload column
+ r#"
+ SELECT l_comment, l_partkey
+ FROM lineitem
+ ORDER BY l_comment
+ "#,
+ // Q4: 2 sort keys {(BIGINT, 1.5M), (INTEGER, 7)} + 1 payload column
+ r#"
+ SELECT l_orderkey, l_linenumber, l_partkey
+ FROM lineitem
+ ORDER BY l_orderkey, l_linenumber
+ "#,
+ // Q5: 3 sort keys {(INTEGER, 7), (BIGINT, 10k), (BIGINT, 1.5M)} + no payload column
+ r#"
+ SELECT l_linenumber, l_suppkey, l_orderkey
+ FROM lineitem
+ ORDER BY l_linenumber, l_suppkey, l_orderkey
+ "#,
+ // Q6: 3 sort keys {(INTEGER, 7), (BIGINT, 10k), (BIGINT, 1.5M)} + 1 payload column
+ r#"
+ SELECT l_linenumber, l_suppkey, l_orderkey, l_partkey
+ FROM lineitem
+ ORDER BY l_linenumber, l_suppkey, l_orderkey
+ "#,
+ // Q7: 3 sort keys {(INTEGER, 7), (BIGINT, 10k), (BIGINT, 1.5M)} + 12 all other columns
+ r#"
+ SELECT l_linenumber, l_suppkey, l_orderkey,
+ l_partkey, l_quantity, l_extendedprice, l_discount, l_tax,
+ l_returnflag, l_linestatus, l_shipdate, l_commitdate,
+ l_receiptdate, l_shipinstruct, l_shipmode
+ FROM lineitem
+ ORDER BY l_linenumber, l_suppkey, l_orderkey
+ "#,
+ // Q8: 4 sort keys {(BIGINT, 1.5M), (BIGINT, 10k), (INTEGER, 7), (VARCHAR, 4.5M)} + no payload column
+ r#"
+ SELECT l_orderkey, l_suppkey, l_linenumber, l_comment
+ FROM lineitem
+ ORDER BY l_orderkey, l_suppkey, l_linenumber, l_comment
+ "#,
+ // Q9: 4 sort keys {(BIGINT, 1.5M), (BIGINT, 10k), (INTEGER, 7), (VARCHAR, 4.5M)} + 1 payload column
+ r#"
+ SELECT l_orderkey, l_suppkey, l_linenumber, l_comment, l_partkey
+ FROM lineitem
+ ORDER BY l_orderkey, l_suppkey, l_linenumber, l_comment
+ "#,
+ // Q10: 4 sort keys {(BIGINT, 1.5M), (BIGINT, 10k), (INTEGER, 7), (VARCHAR, 4.5M)} + 12 all other columns
+ r#"
+ SELECT l_orderkey, l_suppkey, l_linenumber, l_comment,
+ l_partkey, l_quantity, l_extendedprice, l_discount, l_tax,
+ l_returnflag, l_linestatus, l_shipdate, l_commitdate,
+ l_receiptdate, l_shipinstruct, l_shipmode
+ FROM lineitem
+ ORDER BY l_orderkey, l_suppkey, l_linenumber, l_comment
+ "#,
+ ];
+
+ /// If query is specified from command line, run only that query.
+ /// Otherwise, run all queries.
+ pub async fn run(&self) -> Result<()> {
+ let mut benchmark_run = BenchmarkRun::new();
+
+ let query_range = match self.query {
+ Some(query_id) => query_id..=query_id,
+ None => 1..=Self::SORT_QUERIES.len(),
+ };
+
+ for query_id in query_range {
+ benchmark_run.start_new_case(&format!("{query_id}"));
+
+ let query_results = self.benchmark_query(query_id).await?;
+ for iter in query_results {
+ benchmark_run.write_iter(iter.elapsed, iter.row_count);
+ }
+ }
+
+ benchmark_run.maybe_write_json(self.output_path.as_ref())?;
+
+ Ok(())
+ }
+
+ /// Benchmark query `query_id` in `SORT_QUERIES`
+ async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
+ let config = self.common.config();
+ let state = SessionStateBuilder::new()
+ .with_config(config)
+ .with_default_features()
+ .build();
+ let ctx = SessionContext::from(state);
+
+ // register tables
+ self.register_tables(&ctx).await?;
+
+ let mut millis = vec![];
+ // run benchmark
+ let mut query_results = vec![];
+ for i in 0..self.iterations() {
+ let start = Instant::now();
+
+ let query_idx = query_id - 1; // 1-indexed -> 0-indexed
+ let sql = Self::SORT_QUERIES[query_idx];
+
+ let row_count = self.execute_query(&ctx, sql).await?;
+
+ let elapsed = start.elapsed(); //.as_secs_f64() * 1000.0;
+ let ms = elapsed.as_secs_f64() * 1000.0;
+ millis.push(ms);
+
+ println!(
+ "Q{query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
+ );
+ query_results.push(QueryResult { elapsed, row_count });
+ }
+
+ let avg = millis.iter().sum::<f64>() / millis.len() as f64;
+ println!("Q{query_id} avg time: {avg:.2} ms");
+
+ Ok(query_results)
+ }
+
+ async fn register_tables(&self, ctx: &SessionContext) -> Result<()> {
+ for table in Self::SORT_TABLES {
+ let table_provider = { self.get_table(ctx, table).await? };
+
+ if self.mem_table {
+ println!("Loading table '{table}' into memory");
+ let start = Instant::now();
+ let memtable =
+ MemTable::load(table_provider, Some(self.partitions()), &ctx.state())
+ .await?;
+ println!(
+ "Loaded table '{}' into memory in {} ms",
+ table,
+ start.elapsed().as_millis()
+ );
+ ctx.register_table(table, Arc::new(memtable))?;
+ } else {
+ ctx.register_table(table, table_provider)?;
+ }
+ }
+ Ok(())
+ }
+
+ async fn execute_query(&self, ctx: &SessionContext, sql: &str) -> Result<usize> {
+ let debug = self.common.debug;
+ let plan = ctx.sql(sql).await?;
+ let (state, plan) = plan.into_parts();
+
+ if debug {
+ println!("=== Logical plan ===\n{plan}\n");
+ }
+
+ let plan = state.optimize(&plan)?;
+ if debug {
+ println!("=== Optimized logical plan ===\n{plan}\n");
+ }
+ let physical_plan = state.create_physical_plan(&plan).await?;
+ if debug {
+ println!(
+ "=== Physical plan ===\n{}\n",
+ displayable(physical_plan.as_ref()).indent(true)
+ );
+ }
+
+ let mut row_count = 0;
+
+ let mut stream = execute_stream(physical_plan.clone(), state.task_ctx())?;
+ while let Some(batch) = stream.next().await {
+ row_count += batch.unwrap().num_rows();
+ }
+
+ if debug {
+ println!(
+ "=== Physical plan with metrics ===\n{}\n",
+ DisplayableExecutionPlan::with_metrics(physical_plan.as_ref())
+ .indent(true)
+ );
+ }
+
+ Ok(row_count)
+ }
+
+ async fn get_table(
+ &self,
+ ctx: &SessionContext,
+ table: &str,
+ ) -> Result<Arc<dyn TableProvider>> {
+ let path = self.path.to_str().unwrap();
+
+ // Obtain a snapshot of the SessionState
+ let state = ctx.state();
+ let path = format!("{path}/{table}");
+ let format = Arc::new(
+ ParquetFormat::default()
+ .with_options(ctx.state().table_options().parquet.clone()),
+ );
+ let extension = DEFAULT_PARQUET_EXTENSION;
+
+ let options = ListingOptions::new(format)
+ .with_file_extension(extension)
+ .with_collect_stat(state.config().collect_statistics());
+
+ let table_path = ListingTableUrl::parse(path)?;
+ let config = ListingTableConfig::new(table_path).with_listing_options(options);
+ let config = config.infer_schema(&state).await?;
+
+ Ok(Arc::new(ListingTable::try_new(config)?))
+ }
+
+ fn iterations(&self) -> usize {
+ self.common.iterations
+ }
+
+ fn partitions(&self) -> usize {
+ self.common
+ .partitions
+ .unwrap_or(get_available_parallelism())
+ }
+}
diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
index 1a1f51f700651..de3ee3d67db27 100644
--- a/benchmarks/src/tpch/run.rs
+++ b/benchmarks/src/tpch/run.rs
@@ -21,7 +21,7 @@ use std::sync::Arc;
use super::{
get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES,
};
-use crate::{BenchmarkRun, CommonOpt};
+use crate::util::{BenchmarkRun, CommonOpt};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::{self, pretty_format_batches};
@@ -37,6 +37,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
use datafusion::physical_plan::{collect, displayable};
use datafusion::prelude::*;
use datafusion_common::instant::Instant;
+use datafusion_common::utils::get_available_parallelism;
use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
use log::info;
@@ -120,11 +121,6 @@ impl RunOpt {
.config()
.with_collect_statistics(!self.disable_statistics);
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
- config
- .options_mut()
- .execution
- .parquet
- .schema_force_view_types = self.common.force_view_types;
let ctx = SessionContext::new_with_config(config);
// register tables
@@ -301,7 +297,9 @@ impl RunOpt {
}
fn partitions(&self) -> usize {
- self.common.partitions.unwrap_or(num_cpus::get())
+ self.common
+ .partitions
+ .unwrap_or(get_available_parallelism())
}
}
@@ -345,7 +343,6 @@ mod tests {
partitions: Some(2),
batch_size: 8192,
debug: false,
- force_view_types: false,
};
let opt = RunOpt {
query: Some(query),
@@ -379,7 +376,6 @@ mod tests {
partitions: Some(2),
batch_size: 8192,
debug: false,
- force_view_types: false,
};
let opt = RunOpt {
query: Some(query),
diff --git a/benchmarks/src/util/options.rs b/benchmarks/src/util/options.rs
index efdb074b2461e..b1570a1d1bc14 100644
--- a/benchmarks/src/util/options.rs
+++ b/benchmarks/src/util/options.rs
@@ -16,6 +16,7 @@
// under the License.
use datafusion::prelude::SessionConfig;
+use datafusion_common::utils::get_available_parallelism;
use structopt::StructOpt;
// Common benchmark options (don't use doc comments otherwise this doc
@@ -37,11 +38,6 @@ pub struct CommonOpt {
/// Activate debug mode to see more details
#[structopt(short, long)]
pub debug: bool,
-
- /// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
- /// when reading ParquetFiles
- #[structopt(long)]
- pub force_view_types: bool,
}
impl CommonOpt {
@@ -53,7 +49,9 @@ impl CommonOpt {
/// Modify the existing config appropriately
pub fn update_config(&self, config: SessionConfig) -> SessionConfig {
config
- .with_target_partitions(self.partitions.unwrap_or(num_cpus::get()))
+ .with_target_partitions(
+ self.partitions.unwrap_or(get_available_parallelism()),
+ )
.with_batch_size(self.batch_size)
}
}
diff --git a/benchmarks/src/util/run.rs b/benchmarks/src/util/run.rs
index 5ee6691576b44..13969f4d39497 100644
--- a/benchmarks/src/util/run.rs
+++ b/benchmarks/src/util/run.rs
@@ -16,6 +16,7 @@
// under the License.
use datafusion::{error::Result, DATAFUSION_VERSION};
+use datafusion_common::utils::get_available_parallelism;
use serde::{Serialize, Serializer};
use serde_json::Value;
use std::{
@@ -68,7 +69,7 @@ impl RunContext {
Self {
benchmark_version: env!("CARGO_PKG_VERSION").to_owned(),
datafusion_version: DATAFUSION_VERSION.to_owned(),
- num_cpus: num_cpus::get(),
+ num_cpus: get_available_parallelism(),
start_time: SystemTime::now(),
arguments: std::env::args().skip(1).collect::<Vec<String>>(),
}
diff --git a/ci/scripts/retry b/ci/scripts/retry
new file mode 100755
index 0000000000000..411dc532ca52f
--- /dev/null
+++ b/ci/scripts/retry
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
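+
+# Usage: retry <command> [args...]
+# Re-runs the command every 10 seconds until it succeeds, giving up after 5 minutes.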
+
+x() {
+ echo "+ $*" >&2
+ "$@"
+}
+
+max_retry_time_seconds=$(( 5 * 60 ))
+retry_delay_seconds=10
+
+END=$(( $(date +%s) + ${max_retry_time_seconds} ))
+
+while (( $(date +%s) < $END )); do
+ x "$@" && exit 0
+ sleep "${retry_delay_seconds}"
+done
+
+echo "$0: retrying [$*] timed out" >&2
+exit 1
diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh
index 1bb97c88106f2..c3efcf2cf2e92 100755
--- a/ci/scripts/rust_example.sh
+++ b/ci/scripts/rust_example.sh
@@ -17,9 +17,13 @@
# specific language governing permissions and limitations
# under the License.
-set -ex
+set -e
+
+export CARGO_PROFILE_CI_OPT_LEVEL="s"
+export CARGO_PROFILE_CI_STRIP=true
+
cd datafusion-examples/examples/
-cargo check --examples
+cargo build --profile ci --examples
files=$(ls .)
for filename in $files
@@ -27,7 +31,6 @@ do
example_name=`basename $filename ".rs"`
# Skip tests that rely on external storage and flight
if [ ! -d $filename ]; then
- cargo run --example $example_name
- cargo clean -p datafusion-examples
+ cargo run --profile ci --example $example_name
fi
done
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 8a6ccacbb3807..8c7f2113eedb3 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -63,9 +63,9 @@ dependencies = [
[[package]]
name = "allocator-api2"
-version = "0.2.18"
+version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "android-tzdata"
@@ -84,9 +84,9 @@ dependencies = [
[[package]]
name = "anstream"
-version = "0.6.15"
+version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
@@ -99,48 +99,49 @@ dependencies = [
[[package]]
name = "anstyle"
-version = "1.0.8"
+version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
-version = "0.2.5"
+version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
-version = "1.1.1"
+version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
]
[[package]]
name = "anstyle-wincon"
-version = "3.0.4"
+version = "3.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
+checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
dependencies = [
"anstyle",
- "windows-sys 0.52.0",
+ "once_cell",
+ "windows-sys 0.59.0",
]
[[package]]
name = "apache-avro"
-version = "0.16.0"
+version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ceb7c683b2f8f40970b70e39ff8be514c95b96fcb9c4af87e1ed2cb2e10801a0"
+checksum = "1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13"
dependencies = [
- "bzip2",
+ "bigdecimal",
+ "bzip2 0.4.4",
"crc32fast",
"digest",
- "lazy_static",
"libflate",
"log",
"num-bigint",
@@ -148,15 +149,16 @@ dependencies = [
"rand",
"regex-lite",
"serde",
+ "serde_bytes",
"serde_json",
"snap",
- "strum 0.25.0",
- "strum_macros 0.25.3",
- "thiserror",
+ "strum",
+ "strum_macros",
+ "thiserror 1.0.69",
"typed-builder",
"uuid",
"xz2",
- "zstd 0.12.4",
+ "zstd",
]
[[package]]
@@ -173,9 +175,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "arrow"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c"
+checksum = "d2ccdcc8fb14508ca20aaec7076032e5c0b0751b906036d4496786e2f227a37a"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -194,24 +196,23 @@ dependencies = [
[[package]]
name = "arrow-arith"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f"
+checksum = "a1aad8e27f32e411a0fc0bf5a625a35f0bf9b9f871cf4542abe31f7cef4beea2"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-data",
"arrow-schema",
"chrono",
- "half",
"num",
]
[[package]]
name = "arrow-array"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da"
+checksum = "bd6ed90c28c6f73a706c55799b8cc3a094e89257238e5b1d65ca7c70bd3ae23f"
dependencies = [
"ahash",
"arrow-buffer",
@@ -220,15 +221,15 @@ dependencies = [
"chrono",
"chrono-tz",
"half",
- "hashbrown 0.14.5",
+ "hashbrown 0.15.2",
"num",
]
[[package]]
name = "arrow-buffer"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80"
+checksum = "fe4a40bdc1552ea10fbdeae4e5a945d8572c32f66bce457b96c13d9c46b80447"
dependencies = [
"bytes",
"half",
@@ -237,9 +238,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4"
+checksum = "430c0a21aa7f81bcf0f97c57216d7127795ea755f494d27bae2bd233be43c2cc"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -258,28 +259,25 @@ dependencies = [
[[package]]
name = "arrow-csv"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4"
+checksum = "b4444c8f8c57ac00e6a679ede67d1ae8872c170797dc45b46f75702437a77888"
dependencies = [
"arrow-array",
- "arrow-buffer",
"arrow-cast",
- "arrow-data",
"arrow-schema",
"chrono",
"csv",
"csv-core",
"lazy_static",
- "lexical-core",
"regex",
]
[[package]]
name = "arrow-data"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634"
+checksum = "09af476cfbe9879937e50b1334c73189de6039186e025b1b1ac84b283b87b20e"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -289,13 +287,12 @@ dependencies = [
[[package]]
name = "arrow-ipc"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2"
+checksum = "136296e8824333a8a4c4a6e508e4aa65d5678b801246d0408825ae7b2523c628"
dependencies = [
"arrow-array",
"arrow-buffer",
- "arrow-cast",
"arrow-data",
"arrow-schema",
"flatbuffers",
@@ -304,9 +301,9 @@ dependencies = [
[[package]]
name = "arrow-json"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3"
+checksum = "e222ad0e419ab8276818c5605a5bb1e35ed86fa8c5e550726433cc63b09c3c78"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -324,26 +321,23 @@ dependencies = [
[[package]]
name = "arrow-ord"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f"
+checksum = "eddf14c5f03b679ec8ceac4dfac43f63cdc4ed54dab3cc120a4ef46af38481eb"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-data",
"arrow-schema",
"arrow-select",
- "half",
- "num",
]
[[package]]
name = "arrow-row"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6"
+checksum = "e9acdc58da19f383f4ba381fa0e3583534ae2ceb31269aaf4a03f08ff13e8443"
dependencies = [
- "ahash",
"arrow-array",
"arrow-buffer",
"arrow-data",
@@ -353,15 +347,15 @@ dependencies = [
[[package]]
name = "arrow-schema"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794"
+checksum = "3a1822a1a952955637e85e8f9d6b0e04dd75d65492b87ec548dd593d3a1f772b"
[[package]]
name = "arrow-select"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a"
+checksum = "5c4172e9a12dfe15303d3926269f9ead471ea93bdd067d113abc65cb6c48e246"
dependencies = [
"ahash",
"arrow-array",
@@ -373,9 +367,9 @@ dependencies = [
[[package]]
name = "arrow-string"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305"
+checksum = "73683040445f4932342781926189901c9521bb1a787c35dbe628a3ce51372d3c"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -406,27 +400,26 @@ dependencies = [
[[package]]
name = "async-compression"
-version = "0.4.13"
+version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429"
+checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522"
dependencies = [
- "bzip2",
+ "bzip2 0.4.4",
"flate2",
"futures-core",
- "futures-io",
"memchr",
"pin-project-lite",
"tokio",
"xz2",
- "zstd 0.13.2",
- "zstd-safe 7.2.1",
+ "zstd",
+ "zstd-safe",
]
[[package]]
name = "async-trait"
-version = "0.1.83"
+version = "0.1.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd"
+checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056"
dependencies = [
"proc-macro2",
"quote",
@@ -456,9 +449,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "aws-config"
-version = "1.5.8"
+version = "1.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7198e6f03240fdceba36656d8be440297b6b82270325908c7381f37d826a74f6"
+checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -467,7 +460,7 @@ dependencies = [
"aws-sdk-sts",
"aws-smithy-async",
"aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.60.7",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -498,9 +491,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
-version = "1.4.3"
+version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468"
+checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -523,15 +516,15 @@ dependencies = [
[[package]]
name = "aws-sdk-sso"
-version = "1.45.0"
+version = "1.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e33ae899566f3d395cbf42858e433930682cc9c1889fa89318896082fef45efb"
+checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.2",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -545,15 +538,15 @@ dependencies = [
[[package]]
name = "aws-sdk-ssooidc"
-version = "1.46.0"
+version = "1.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f39c09e199ebd96b9f860b0fce4b6625f211e064ad7c8693b72ecf7ef03881e0"
+checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.2",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -567,15 +560,15 @@ dependencies = [
[[package]]
name = "aws-sdk-sts"
-version = "1.45.0"
+version = "1.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d95f93a98130389eb6233b9d615249e543f6c24a68ca1f109af9ca5164a8765"
+checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.2",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
@@ -590,9 +583,9 @@ dependencies = [
[[package]]
name = "aws-sigv4"
-version = "1.2.4"
+version = "1.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68"
+checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05"
dependencies = [
"aws-credential-types",
"aws-smithy-http",
@@ -603,7 +596,7 @@ dependencies = [
"hex",
"hmac",
"http 0.2.12",
- "http 1.1.0",
+ "http 1.2.0",
"once_cell",
"percent-encoding",
"sha2",
@@ -613,9 +606,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
-version = "1.2.1"
+version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
+checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -624,9 +617,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
-version = "0.60.11"
+version = "0.60.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6"
+checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc"
dependencies = [
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -651,6 +644,15 @@ dependencies = [
"aws-smithy-types",
]
+[[package]]
+name = "aws-smithy-json"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422"
+dependencies = [
+ "aws-smithy-types",
+]
+
[[package]]
name = "aws-smithy-query"
version = "0.60.7"
@@ -663,9 +665,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
-version = "1.7.1"
+version = "1.7.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87"
+checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -678,7 +680,7 @@ dependencies = [
"http-body 0.4.6",
"http-body 1.0.1",
"httparse",
- "hyper 0.14.30",
+ "hyper 0.14.32",
"hyper-rustls 0.24.2",
"once_cell",
"pin-project-lite",
@@ -690,15 +692,15 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime-api"
-version = "1.7.2"
+version = "1.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96"
+checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd"
dependencies = [
"aws-smithy-async",
"aws-smithy-types",
"bytes",
"http 0.2.12",
- "http 1.1.0",
+ "http 1.2.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -707,16 +709,16 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
-version = "1.2.7"
+version = "1.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b"
+checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97"
dependencies = [
"base64-simd",
"bytes",
"bytes-utils",
"futures-core",
"http 0.2.12",
- "http 1.1.0",
+ "http 1.2.0",
"http-body 0.4.6",
"http-body 1.0.1",
"http-body-util",
@@ -742,9 +744,9 @@ dependencies = [
[[package]]
name = "aws-types"
-version = "1.3.3"
+version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef"
+checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2"
dependencies = [
"aws-credential-types",
"aws-smithy-async",
@@ -791,6 +793,20 @@ dependencies = [
"vsimd",
]
+[[package]]
+name = "bigdecimal"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f31f3af01c5c65a07985c804d3366560e6fa7883d640a122819b14ec327482c"
+dependencies = [
+ "autocfg",
+ "libm",
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+ "serde",
+]
+
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -799,9 +815,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
-version = "2.6.0"
+version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
[[package]]
name = "blake2"
@@ -814,9 +830,9 @@ dependencies = [
[[package]]
name = "blake3"
-version = "1.5.4"
+version = "1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7"
+checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e"
dependencies = [
"arrayref",
"arrayvec",
@@ -836,9 +852,9 @@ dependencies = [
[[package]]
name = "brotli"
-version = "6.0.0"
+version = "7.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b"
+checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
@@ -857,9 +873,9 @@ dependencies = [
[[package]]
name = "bstr"
-version = "1.10.0"
+version = "1.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c"
+checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0"
dependencies = [
"memchr",
"regex-automata",
@@ -880,9 +896,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
-version = "1.7.2"
+version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3"
+checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
[[package]]
name = "bytes-utils"
@@ -904,6 +920,16 @@ dependencies = [
"libc",
]
+[[package]]
+name = "bzip2"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58"
+dependencies = [
+ "bzip2-sys",
+ "libc",
+]
+
[[package]]
name = "bzip2-sys"
version = "0.1.11+1.0.8"
@@ -917,9 +943,9 @@ dependencies = [
[[package]]
name = "cc"
-version = "1.1.28"
+version = "1.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1"
+checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b"
dependencies = [
"jobserver",
"libc",
@@ -938,11 +964,17 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
[[package]]
name = "chrono"
-version = "0.4.38"
+version = "0.4.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
+checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825"
dependencies = [
"android-tzdata",
"iana-time-zone",
@@ -974,9 +1006,9 @@ dependencies = [
[[package]]
name = "clap"
-version = "4.5.19"
+version = "4.5.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615"
+checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783"
dependencies = [
"clap_builder",
"clap_derive",
@@ -984,9 +1016,9 @@ dependencies = [
[[package]]
name = "clap_builder"
-version = "4.5.19"
+version = "4.5.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b"
+checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121"
dependencies = [
"anstream",
"anstyle",
@@ -996,11 +1028,11 @@ dependencies = [
[[package]]
name = "clap_derive"
-version = "4.5.18"
+version = "4.5.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
+checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c"
dependencies = [
- "heck 0.5.0",
+ "heck",
"proc-macro2",
"quote",
"syn",
@@ -1008,9 +1040,9 @@ dependencies = [
[[package]]
name = "clap_lex"
-version = "0.7.2"
+version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "clipboard-win"
@@ -1023,19 +1055,19 @@ dependencies = [
[[package]]
name = "colorchoice"
-version = "1.0.2"
+version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "comfy-table"
-version = "7.1.1"
+version = "7.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
+checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9"
dependencies = [
- "strum 0.26.3",
- "strum_macros 0.26.4",
- "unicode-width",
+ "strum",
+ "strum_macros",
+ "unicode-width 0.2.0",
]
[[package]]
@@ -1074,6 +1106,16 @@ dependencies = [
"libc",
]
+[[package]]
+name = "core-foundation"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
@@ -1091,9 +1133,9 @@ dependencies = [
[[package]]
name = "cpufeatures"
-version = "0.2.14"
+version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0"
+checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3"
dependencies = [
"libc",
]
@@ -1109,9 +1151,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
-version = "0.8.20"
+version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
@@ -1131,9 +1173,9 @@ dependencies = [
[[package]]
name = "csv"
-version = "1.3.0"
+version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe"
+checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
dependencies = [
"csv-core",
"itoa",
@@ -1152,9 +1194,9 @@ dependencies = [
[[package]]
name = "ctor"
-version = "0.2.8"
+version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f"
+checksum = "32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501"
dependencies = [
"quote",
"syn",
@@ -1162,9 +1204,9 @@ dependencies = [
[[package]]
name = "dary_heap"
-version = "0.3.6"
+version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca"
+checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
[[package]]
name = "dashmap"
@@ -1182,9 +1224,8 @@ dependencies = [
[[package]]
name = "datafusion"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
- "ahash",
"apache-avro",
"arrow",
"arrow-array",
@@ -1193,7 +1234,7 @@ dependencies = [
"async-compression",
"async-trait",
"bytes",
- "bzip2",
+ "bzip2 0.5.0",
"chrono",
"dashmap",
"datafusion-catalog",
@@ -1204,6 +1245,7 @@ dependencies = [
"datafusion-functions",
"datafusion-functions-aggregate",
"datafusion-functions-nested",
+ "datafusion-functions-table",
"datafusion-functions-window",
"datafusion-optimizer",
"datafusion-physical-expr",
@@ -1214,19 +1256,14 @@ dependencies = [
"flate2",
"futures",
"glob",
- "half",
- "hashbrown 0.14.5",
- "indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
"num-traits",
- "num_cpus",
"object_store",
"parking_lot",
"parquet",
- "paste",
- "pin-project-lite",
"rand",
+ "regex",
"sqlparser",
"tempfile",
"tokio",
@@ -1234,12 +1271,12 @@ dependencies = [
"url",
"uuid",
"xz2",
- "zstd 0.13.2",
+ "zstd",
]
[[package]]
name = "datafusion-catalog"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow-schema",
"async-trait",
@@ -1252,7 +1289,7 @@ dependencies = [
[[package]]
name = "datafusion-cli"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
"assert_cmd",
@@ -1265,9 +1302,11 @@ dependencies = [
"clap",
"ctor",
"datafusion",
+ "datafusion-catalog",
"dirs",
"env_logger",
"futures",
+ "home",
"mimalloc",
"object_store",
"parking_lot",
@@ -1282,46 +1321,51 @@ dependencies = [
[[package]]
name = "datafusion-common"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"ahash",
"apache-avro",
"arrow",
"arrow-array",
"arrow-buffer",
+ "arrow-ipc",
"arrow-schema",
- "chrono",
+ "base64 0.22.1",
"half",
"hashbrown 0.14.5",
- "instant",
+ "indexmap",
"libc",
- "num_cpus",
+ "log",
"object_store",
"parquet",
"paste",
+ "recursive",
"sqlparser",
"tokio",
+ "web-time",
]
[[package]]
name = "datafusion-common-runtime"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"log",
"tokio",
]
+[[package]]
+name = "datafusion-doc"
+version = "44.0.0"
+
[[package]]
name = "datafusion-execution"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
- "chrono",
"dashmap",
"datafusion-common",
"datafusion-expr",
"futures",
- "hashbrown 0.14.5",
"log",
"object_store",
"parking_lot",
@@ -1332,38 +1376,35 @@ dependencies = [
[[package]]
name = "datafusion-expr"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
- "ahash",
"arrow",
- "arrow-array",
- "arrow-buffer",
"chrono",
"datafusion-common",
+ "datafusion-doc",
"datafusion-expr-common",
"datafusion-functions-aggregate-common",
"datafusion-functions-window-common",
"datafusion-physical-expr-common",
"indexmap",
"paste",
+ "recursive",
"serde_json",
"sqlparser",
- "strum 0.26.3",
- "strum_macros 0.26.4",
]
[[package]]
name = "datafusion-expr-common"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
"datafusion-common",
- "paste",
+ "itertools 0.14.0",
]
[[package]]
name = "datafusion-functions"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
"arrow-buffer",
@@ -1372,11 +1413,14 @@ dependencies = [
"blake3",
"chrono",
"datafusion-common",
+ "datafusion-doc",
"datafusion-execution",
"datafusion-expr",
+ "datafusion-expr-common",
+ "datafusion-macros",
"hashbrown 0.14.5",
"hex",
- "itertools",
+ "itertools 0.14.0",
"log",
"md-5",
"rand",
@@ -1388,38 +1432,38 @@ dependencies = [
[[package]]
name = "datafusion-functions-aggregate"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"ahash",
"arrow",
"arrow-schema",
"datafusion-common",
+ "datafusion-doc",
"datafusion-execution",
"datafusion-expr",
"datafusion-functions-aggregate-common",
+ "datafusion-macros",
"datafusion-physical-expr",
"datafusion-physical-expr-common",
"half",
- "indexmap",
"log",
"paste",
]
[[package]]
name = "datafusion-functions-aggregate-common"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"ahash",
"arrow",
"datafusion-common",
"datafusion-expr-common",
"datafusion-physical-expr-common",
- "rand",
]
[[package]]
name = "datafusion-functions-nested"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
"arrow-array",
@@ -1427,24 +1471,42 @@ dependencies = [
"arrow-ord",
"arrow-schema",
"datafusion-common",
+ "datafusion-doc",
"datafusion-execution",
"datafusion-expr",
"datafusion-functions",
"datafusion-functions-aggregate",
+ "datafusion-macros",
"datafusion-physical-expr-common",
- "itertools",
+ "itertools 0.14.0",
"log",
"paste",
- "rand",
+]
+
+[[package]]
+name = "datafusion-functions-table"
+version = "44.0.0"
+dependencies = [
+ "arrow",
+ "async-trait",
+ "datafusion-catalog",
+ "datafusion-common",
+ "datafusion-expr",
+ "datafusion-physical-plan",
+ "parking_lot",
+ "paste",
]
[[package]]
name = "datafusion-functions-window"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"datafusion-common",
+ "datafusion-doc",
"datafusion-expr",
"datafusion-functions-window-common",
+ "datafusion-macros",
+ "datafusion-physical-expr",
"datafusion-physical-expr-common",
"log",
"paste",
@@ -1452,86 +1514,95 @@ dependencies = [
[[package]]
name = "datafusion-functions-window-common"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"datafusion-common",
+ "datafusion-physical-expr-common",
+]
+
+[[package]]
+name = "datafusion-macros"
+version = "44.0.0"
+dependencies = [
+ "datafusion-expr",
+ "quote",
+ "syn",
]
[[package]]
name = "datafusion-optimizer"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
- "async-trait",
"chrono",
"datafusion-common",
"datafusion-expr",
"datafusion-physical-expr",
- "hashbrown 0.14.5",
"indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
- "paste",
+ "recursive",
+ "regex",
"regex-syntax",
]
[[package]]
name = "datafusion-physical-expr"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"ahash",
"arrow",
"arrow-array",
"arrow-buffer",
- "arrow-ord",
"arrow-schema",
- "arrow-string",
- "base64 0.22.1",
- "chrono",
"datafusion-common",
- "datafusion-execution",
"datafusion-expr",
"datafusion-expr-common",
"datafusion-functions-aggregate-common",
"datafusion-physical-expr-common",
"half",
"hashbrown 0.14.5",
- "hex",
"indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
"paste",
"petgraph",
- "regex",
]
[[package]]
name = "datafusion-physical-expr-common"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"ahash",
"arrow",
"datafusion-common",
"datafusion-expr-common",
"hashbrown 0.14.5",
- "rand",
+ "itertools 0.14.0",
]
[[package]]
name = "datafusion-physical-optimizer"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
+ "arrow",
"arrow-schema",
"datafusion-common",
"datafusion-execution",
+ "datafusion-expr",
+ "datafusion-expr-common",
"datafusion-physical-expr",
+ "datafusion-physical-expr-common",
"datafusion-physical-plan",
- "itertools",
+ "futures",
+ "itertools 0.14.0",
+ "log",
+ "recursive",
]
[[package]]
name = "datafusion-physical-plan"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"ahash",
"arrow",
@@ -1545,8 +1616,6 @@ dependencies = [
"datafusion-common-runtime",
"datafusion-execution",
"datafusion-expr",
- "datafusion-functions-aggregate",
- "datafusion-functions-aggregate-common",
"datafusion-functions-window-common",
"datafusion-physical-expr",
"datafusion-physical-expr-common",
@@ -1554,28 +1623,28 @@ dependencies = [
"half",
"hashbrown 0.14.5",
"indexmap",
- "itertools",
+ "itertools 0.14.0",
"log",
- "once_cell",
"parking_lot",
"pin-project-lite",
- "rand",
"tokio",
]
[[package]]
name = "datafusion-sql"
-version = "42.0.0"
+version = "44.0.0"
dependencies = [
"arrow",
"arrow-array",
"arrow-schema",
+ "bigdecimal",
"datafusion-common",
"datafusion-expr",
+ "indexmap",
"log",
+ "recursive",
"regex",
"sqlparser",
- "strum 0.26.3",
]
[[package]]
@@ -1625,6 +1694,17 @@ dependencies = [
"windows-sys 0.48.0",
]
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "doc-comment"
version = "0.3.3"
@@ -1645,9 +1725,9 @@ checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
[[package]]
name = "env_filter"
-version = "0.1.2"
+version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
dependencies = [
"log",
"regex",
@@ -1655,9 +1735,9 @@ dependencies = [
[[package]]
name = "env_logger"
-version = "0.11.5"
+version = "0.11.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
+checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0"
dependencies = [
"anstream",
"anstyle",
@@ -1674,12 +1754,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
-version = "0.3.9"
+version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
dependencies = [
"libc",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
]
[[package]]
@@ -1690,9 +1770,9 @@ checksum = "a5d9305ccc6942a704f4335694ecd3de2ea531b114ac2d51f5f843750787a92f"
[[package]]
name = "fastrand"
-version = "2.1.1"
+version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
+checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "fd-lock"
@@ -1707,15 +1787,15 @@ dependencies = [
[[package]]
name = "fixedbitset"
-version = "0.4.2"
+version = "0.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
[[package]]
name = "flatbuffers"
-version = "24.3.25"
+version = "24.12.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f"
+checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096"
dependencies = [
"bitflags 1.3.2",
"rustc_version",
@@ -1723,9 +1803,9 @@ dependencies = [
[[package]]
name = "flate2"
-version = "1.0.34"
+version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0"
+checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c"
dependencies = [
"crc32fast",
"miniz_oxide",
@@ -1733,9 +1813,9 @@ dependencies = [
[[package]]
name = "float-cmp"
-version = "0.9.0"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4"
+checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8"
dependencies = [
"num-traits",
]
@@ -1867,8 +1947,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if",
+ "js-sys",
"libc",
"wasi",
+ "wasm-bindgen",
]
[[package]]
@@ -1879,9 +1961,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "glob"
-version = "0.3.1"
+version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
+checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
[[package]]
name = "h2"
@@ -1904,16 +1986,16 @@ dependencies = [
[[package]]
name = "h2"
-version = "0.4.6"
+version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205"
+checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
- "http 1.1.0",
+ "http 1.2.0",
"indexmap",
"slab",
"tokio",
@@ -1944,15 +2026,9 @@ dependencies = [
[[package]]
name = "hashbrown"
-version = "0.15.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
-
-[[package]]
-name = "heck"
-version = "0.4.1"
+version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
[[package]]
name = "heck"
@@ -1960,12 +2036,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-[[package]]
-name = "hermit-abi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
-
[[package]]
name = "hex"
version = "0.4.3"
@@ -2003,9 +2073,9 @@ dependencies = [
[[package]]
name = "http"
-version = "1.1.0"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
+checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea"
dependencies = [
"bytes",
"fnv",
@@ -2030,7 +2100,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
- "http 1.1.0",
+ "http 1.2.0",
]
[[package]]
@@ -2041,7 +2111,7 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f"
dependencies = [
"bytes",
"futures-util",
- "http 1.1.0",
+ "http 1.2.0",
"http-body 1.0.1",
"pin-project-lite",
]
@@ -2066,9 +2136,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
-version = "0.14.30"
+version = "0.14.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9"
+checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7"
dependencies = [
"bytes",
"futures-channel",
@@ -2090,15 +2160,15 @@ dependencies = [
[[package]]
name = "hyper"
-version = "1.4.1"
+version = "1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
+checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
- "h2 0.4.6",
- "http 1.1.0",
+ "h2 0.4.7",
+ "http 1.2.0",
"http-body 1.0.1",
"httparse",
"itoa",
@@ -2116,7 +2186,7 @@ checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590"
dependencies = [
"futures-util",
"http 0.2.12",
- "hyper 0.14.30",
+ "hyper 0.14.32",
"log",
"rustls 0.21.12",
"rustls-native-certs 0.6.3",
@@ -2126,34 +2196,34 @@ dependencies = [
[[package]]
name = "hyper-rustls"
-version = "0.27.3"
+version = "0.27.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333"
+checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2"
dependencies = [
"futures-util",
- "http 1.1.0",
- "hyper 1.4.1",
+ "http 1.2.0",
+ "hyper 1.5.2",
"hyper-util",
- "rustls 0.23.14",
- "rustls-native-certs 0.8.0",
+ "rustls 0.23.21",
+ "rustls-native-certs 0.8.1",
"rustls-pki-types",
"tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.1",
"tower-service",
]
[[package]]
name = "hyper-util"
-version = "0.1.9"
+version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b"
+checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
- "http 1.1.0",
+ "http 1.2.0",
"http-body 1.0.1",
- "hyper 1.4.1",
+ "hyper 1.5.2",
"pin-project-lite",
"socket2",
"tokio",
@@ -2184,36 +2254,153 @@ dependencies = [
"cc",
]
+[[package]]
+name = "icu_collections"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locid"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locid_transform"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
+dependencies = [
+ "displaydoc",
+ "icu_locid",
+ "icu_locid_transform_data",
+ "icu_provider",
+ "tinystr",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locid_transform_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
+
+[[package]]
+name = "icu_normalizer"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
+dependencies = [
+ "displaydoc",
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "utf16_iter",
+ "utf8_iter",
+ "write16",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
+
+[[package]]
+name = "icu_properties"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
+dependencies = [
+ "displaydoc",
+ "icu_collections",
+ "icu_locid_transform",
+ "icu_properties_data",
+ "icu_provider",
+ "tinystr",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
+
+[[package]]
+name = "icu_provider"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
+dependencies = [
+ "displaydoc",
+ "icu_locid",
+ "icu_provider_macros",
+ "stable_deref_trait",
+ "tinystr",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_provider_macros"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "idna"
-version = "0.5.0"
+version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
+checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
dependencies = [
- "unicode-bidi",
- "unicode-normalization",
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
]
[[package]]
-name = "indexmap"
-version = "2.6.0"
+name = "idna_adapter"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
+checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
dependencies = [
- "equivalent",
- "hashbrown 0.15.0",
+ "icu_normalizer",
+ "icu_properties",
]
[[package]]
-name = "instant"
-version = "0.1.13"
+name = "indexmap"
+version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
dependencies = [
- "cfg-if",
- "js-sys",
- "wasm-bindgen",
- "web-sys",
+ "equivalent",
+ "hashbrown 0.15.2",
]
[[package]]
@@ -2243,11 +2430,20 @@ dependencies = [
"either",
]
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
[[package]]
name = "itoa"
-version = "1.0.11"
+version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "jobserver"
@@ -2260,10 +2456,11 @@ dependencies = [
[[package]]
name = "js-sys"
-version = "0.3.70"
+version = "0.3.77"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a"
+checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
dependencies = [
+ "once_cell",
"wasm-bindgen",
]
@@ -2275,9 +2472,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lexical-core"
-version = "1.0.2"
+version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458"
+checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958"
dependencies = [
"lexical-parse-float",
"lexical-parse-integer",
@@ -2288,9 +2485,9 @@ dependencies = [
[[package]]
name = "lexical-parse-float"
-version = "1.0.2"
+version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0"
+checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2"
dependencies = [
"lexical-parse-integer",
"lexical-util",
@@ -2299,9 +2496,9 @@ dependencies = [
[[package]]
name = "lexical-parse-integer"
-version = "1.0.2"
+version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61"
+checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e"
dependencies = [
"lexical-util",
"static_assertions",
@@ -2309,18 +2506,18 @@ dependencies = [
[[package]]
name = "lexical-util"
-version = "1.0.3"
+version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0"
+checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3"
dependencies = [
"static_assertions",
]
[[package]]
name = "lexical-write-float"
-version = "1.0.2"
+version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809"
+checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd"
dependencies = [
"lexical-util",
"lexical-write-integer",
@@ -2329,9 +2526,9 @@ dependencies = [
[[package]]
name = "lexical-write-integer"
-version = "1.0.2"
+version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162"
+checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978"
dependencies = [
"lexical-util",
"static_assertions",
@@ -2339,9 +2536,9 @@ dependencies = [
[[package]]
name = "libc"
-version = "0.2.159"
+version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
+checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "libflate"
@@ -2369,9 +2566,9 @@ dependencies = [
[[package]]
name = "libm"
-version = "0.2.8"
+version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
[[package]]
name = "libmimalloc-sys"
@@ -2389,15 +2586,21 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
dependencies = [
- "bitflags 2.6.0",
+ "bitflags 2.8.0",
"libc",
]
[[package]]
name = "linux-raw-sys"
-version = "0.4.14"
+version = "0.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
+checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
+
+[[package]]
+name = "litemap"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
[[package]]
name = "lock_api"
@@ -2411,9 +2614,9 @@ dependencies = [
[[package]]
name = "log"
-version = "0.4.22"
+version = "0.4.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
[[package]]
name = "lz4_flex"
@@ -2468,20 +2671,19 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "miniz_oxide"
-version = "0.8.0"
+version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1"
+checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924"
dependencies = [
"adler2",
]
[[package]]
name = "mio"
-version = "1.0.2"
+version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
+checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
dependencies = [
- "hermit-abi",
"libc",
"wasi",
"windows-sys 0.52.0",
@@ -2502,9 +2704,9 @@ version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4"
dependencies = [
- "bitflags 2.6.0",
+ "bitflags 2.8.0",
"cfg-if",
- "cfg_aliases",
+ "cfg_aliases 0.1.1",
"libc",
]
@@ -2536,6 +2738,7 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
"num-integer",
"num-traits",
+ "serde",
]
[[package]]
@@ -2594,30 +2797,20 @@ dependencies = [
"libm",
]
-[[package]]
-name = "num_cpus"
-version = "1.16.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
-dependencies = [
- "hermit-abi",
- "libc",
-]
-
[[package]]
name = "object"
-version = "0.36.5"
+version = "0.36.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e"
+checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
dependencies = [
"memchr",
]
[[package]]
name = "object_store"
-version = "0.11.0"
+version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45"
+checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -2625,8 +2818,8 @@ dependencies = [
"chrono",
"futures",
"humantime",
- "hyper 1.4.1",
- "itertools",
+ "hyper 1.5.2",
+ "itertools 0.13.0",
"md-5",
"parking_lot",
"percent-encoding",
@@ -2673,9 +2866,9 @@ dependencies = [
[[package]]
name = "outref"
-version = "0.5.1"
+version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
+checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e"
[[package]]
name = "parking_lot"
@@ -2702,9 +2895,9 @@ dependencies = [
[[package]]
name = "parquet"
-version = "53.1.0"
+version = "54.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e"
+checksum = "3334c50239d9f4951653d84fa6f636da86f53742e5e5849a30fbe852f3ff4383"
dependencies = [
"ahash",
"arrow-array",
@@ -2721,7 +2914,7 @@ dependencies = [
"flate2",
"futures",
"half",
- "hashbrown 0.14.5",
+ "hashbrown 0.15.2",
"lz4_flex",
"num",
"num-bigint",
@@ -2732,7 +2925,7 @@ dependencies = [
"thrift",
"tokio",
"twox-hash",
- "zstd 0.13.2",
+ "zstd",
"zstd-sys",
]
@@ -2759,9 +2952,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "petgraph"
-version = "0.6.5"
+version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
+checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
dependencies = [
"fixedbitset",
"indexmap",
@@ -2769,18 +2962,18 @@ dependencies = [
[[package]]
name = "phf"
-version = "0.11.2"
+version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
-version = "0.11.2"
+version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator",
"phf_shared",
@@ -2788,9 +2981,9 @@ dependencies = [
[[package]]
name = "phf_generator"
-version = "0.11.2"
+version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand",
@@ -2798,18 +2991,18 @@ dependencies = [
[[package]]
name = "phf_shared"
-version = "0.11.2"
+version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
+checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
-version = "0.2.14"
+version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
name = "pin-utils"
@@ -2840,9 +3033,9 @@ dependencies = [
[[package]]
name = "predicates"
-version = "3.1.2"
+version = "3.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e9086cc7640c29a356d1a29fd134380bee9d8f79a17410aa76e7ad295f42c97"
+checksum = "a5d19ee57562043d37e82899fade9a22ebab7be9cef5026b07fda9cdd4293573"
dependencies = [
"anstyle",
"difflib",
@@ -2854,15 +3047,15 @@ dependencies = [
[[package]]
name = "predicates-core"
-version = "1.0.8"
+version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae8177bee8e75d6846599c6b9ff679ed51e882816914eec639944d7c9aa11931"
+checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa"
[[package]]
name = "predicates-tree"
-version = "1.0.11"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41b740d195ed3166cd147c8047ec98db0e22ec019eb8eeb76d343b795304fb13"
+checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c"
dependencies = [
"predicates-core",
"termtree",
@@ -2879,24 +3072,33 @@ dependencies = [
[[package]]
name = "proc-macro2"
-version = "1.0.86"
+version = "1.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
dependencies = [
"unicode-ident",
]
+[[package]]
+name = "psm"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810"
+dependencies = [
+ "cc",
+]
+
[[package]]
name = "quad-rand"
-version = "0.2.2"
+version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db"
+checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40"
[[package]]
name = "quick-xml"
-version = "0.36.2"
+version = "0.37.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe"
+checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003"
dependencies = [
"memchr",
"serde",
@@ -2904,45 +3106,49 @@ dependencies = [
[[package]]
name = "quinn"
-version = "0.11.5"
+version = "0.11.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684"
+checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef"
dependencies = [
"bytes",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
- "rustls 0.23.14",
+ "rustls 0.23.21",
"socket2",
- "thiserror",
+ "thiserror 2.0.11",
"tokio",
"tracing",
]
[[package]]
name = "quinn-proto"
-version = "0.11.8"
+version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6"
+checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d"
dependencies = [
"bytes",
+ "getrandom",
"rand",
"ring",
"rustc-hash",
- "rustls 0.23.14",
+ "rustls 0.23.21",
+ "rustls-pki-types",
"slab",
- "thiserror",
+ "thiserror 2.0.11",
"tinyvec",
"tracing",
+ "web-time",
]
[[package]]
name = "quinn-udp"
-version = "0.5.5"
+version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b"
+checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904"
dependencies = [
+ "cfg_aliases 0.2.1",
"libc",
"once_cell",
"socket2",
@@ -2952,9 +3158,9 @@ dependencies = [
[[package]]
name = "quote"
-version = "1.0.37"
+version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
dependencies = [
"proc-macro2",
]
@@ -2999,13 +3205,33 @@ dependencies = [
"getrandom",
]
+[[package]]
+name = "recursive"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e"
+dependencies = [
+ "recursive-proc-macro-impl",
+ "stacker",
+]
+
+[[package]]
+name = "recursive-proc-macro-impl"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
+dependencies = [
+ "quote",
+ "syn",
+]
+
[[package]]
name = "redox_syscall"
-version = "0.5.7"
+version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f"
+checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
dependencies = [
- "bitflags 2.6.0",
+ "bitflags 2.8.0",
]
[[package]]
@@ -3016,14 +3242,14 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
dependencies = [
"getrandom",
"libredox",
- "thiserror",
+ "thiserror 1.0.69",
]
[[package]]
name = "regex"
-version = "1.11.0"
+version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
@@ -3033,9 +3259,9 @@ dependencies = [
[[package]]
name = "regex-automata"
-version = "0.4.8"
+version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
@@ -3062,20 +3288,20 @@ checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2"
[[package]]
name = "reqwest"
-version = "0.12.8"
+version = "0.12.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b"
+checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-core",
"futures-util",
- "h2 0.4.6",
- "http 1.1.0",
+ "h2 0.4.7",
+ "http 1.2.0",
"http-body 1.0.1",
"http-body-util",
- "hyper 1.4.1",
- "hyper-rustls 0.27.3",
+ "hyper 1.5.2",
+ "hyper-rustls 0.27.5",
"hyper-util",
"ipnet",
"js-sys",
@@ -3085,8 +3311,8 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"quinn",
- "rustls 0.23.14",
- "rustls-native-certs 0.8.0",
+ "rustls 0.23.21",
+ "rustls-native-certs 0.8.1",
"rustls-pemfile 2.2.0",
"rustls-pki-types",
"serde",
@@ -3094,8 +3320,9 @@ dependencies = [
"serde_urlencoded",
"sync_wrapper",
"tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.1",
"tokio-util",
+ "tower",
"tower-service",
"url",
"wasm-bindgen",
@@ -3164,9 +3391,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
-version = "2.0.0"
+version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
+checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497"
[[package]]
name = "rustc_version"
@@ -3179,15 +3406,15 @@ dependencies = [
[[package]]
name = "rustix"
-version = "0.38.37"
+version = "0.38.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811"
+checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
dependencies = [
- "bitflags 2.6.0",
+ "bitflags 2.8.0",
"errno",
"libc",
"linux-raw-sys",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
]
[[package]]
@@ -3204,9 +3431,9 @@ dependencies = [
[[package]]
name = "rustls"
-version = "0.23.14"
+version = "0.23.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8"
+checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8"
dependencies = [
"once_cell",
"ring",
@@ -3225,20 +3452,19 @@ dependencies = [
"openssl-probe",
"rustls-pemfile 1.0.4",
"schannel",
- "security-framework",
+ "security-framework 2.11.1",
]
[[package]]
name = "rustls-native-certs"
-version = "0.8.0"
+version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a"
+checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3"
dependencies = [
"openssl-probe",
- "rustls-pemfile 2.2.0",
"rustls-pki-types",
"schannel",
- "security-framework",
+ "security-framework 3.2.0",
]
[[package]]
@@ -3261,9 +3487,12 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
-version = "1.9.0"
+version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55"
+checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37"
+dependencies = [
+ "web-time",
+]
[[package]]
name = "rustls-webpki"
@@ -3288,9 +3517,9 @@ dependencies = [
[[package]]
name = "rustversion"
-version = "1.0.17"
+version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
+checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4"
[[package]]
name = "rustyline"
@@ -3298,7 +3527,7 @@ version = "14.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7803e8936da37efd9b6d4478277f4b2b9bb5cdb37a113e8d63222e58da647e63"
dependencies = [
- "bitflags 2.6.0",
+ "bitflags 2.8.0",
"cfg-if",
"clipboard-win",
"fd-lock",
@@ -3309,7 +3538,7 @@ dependencies = [
"nix",
"radix_trie",
"unicode-segmentation",
- "unicode-width",
+ "unicode-width 0.1.14",
"utf8parse",
"windows-sys 0.52.0",
]
@@ -3331,9 +3560,9 @@ dependencies = [
[[package]]
name = "schannel"
-version = "0.1.26"
+version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1"
+checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
dependencies = [
"windows-sys 0.59.0",
]
@@ -3360,8 +3589,21 @@ version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
dependencies = [
- "bitflags 2.6.0",
- "core-foundation",
+ "bitflags 2.8.0",
+ "core-foundation 0.9.4",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316"
+dependencies = [
+ "bitflags 2.8.0",
+ "core-foundation 0.10.0",
"core-foundation-sys",
"libc",
"security-framework-sys",
@@ -3369,9 +3611,9 @@ dependencies = [
[[package]]
name = "security-framework-sys"
-version = "2.12.0"
+version = "2.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6"
+checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32"
dependencies = [
"core-foundation-sys",
"libc",
@@ -3379,9 +3621,9 @@ dependencies = [
[[package]]
name = "semver"
-version = "1.0.23"
+version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
+checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba"
[[package]]
name = "seq-macro"
@@ -3391,18 +3633,27 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
-version = "1.0.210"
+version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
+checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
dependencies = [
"serde_derive",
]
+[[package]]
+name = "serde_bytes"
+version = "0.11.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "serde_derive"
-version = "1.0.210"
+version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
+checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
dependencies = [
"proc-macro2",
"quote",
@@ -3411,9 +3662,9 @@ dependencies = [
[[package]]
name = "serde_json"
-version = "1.0.128"
+version = "1.0.135"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
+checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9"
dependencies = [
"itoa",
"memchr",
@@ -3461,9 +3712,9 @@ dependencies = [
[[package]]
name = "siphasher"
-version = "0.3.11"
+version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
+checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "slab"
@@ -3495,7 +3746,7 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917"
dependencies = [
- "heck 0.5.0",
+ "heck",
"proc-macro2",
"quote",
"syn",
@@ -3509,9 +3760,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
[[package]]
name = "socket2"
-version = "0.5.7"
+version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c"
+checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8"
dependencies = [
"libc",
"windows-sys 0.52.0",
@@ -3525,9 +3776,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "sqlparser"
-version = "0.51.0"
+version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7"
+checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8"
dependencies = [
"log",
"sqlparser_derive",
@@ -3535,15 +3786,34 @@ dependencies = [
[[package]]
name = "sqlparser_derive"
-version = "0.2.2"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554"
+checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+
+[[package]]
+name = "stacker"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "libc",
+ "psm",
+ "windows-sys 0.59.0",
+]
+
[[package]]
name = "static_assertions"
version = "1.1.0"
@@ -3556,33 +3826,11 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
-[[package]]
-name = "strum"
-version = "0.25.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125"
-
[[package]]
name = "strum"
version = "0.26.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
-dependencies = [
- "strum_macros 0.26.4",
-]
-
-[[package]]
-name = "strum_macros"
-version = "0.25.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0"
-dependencies = [
- "heck 0.4.1",
- "proc-macro2",
- "quote",
- "rustversion",
- "syn",
-]
[[package]]
name = "strum_macros"
@@ -3590,7 +3838,7 @@ version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
- "heck 0.5.0",
+ "heck",
"proc-macro2",
"quote",
"rustversion",
@@ -3605,9 +3853,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
-version = "2.0.79"
+version = "2.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
+checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
dependencies = [
"proc-macro2",
"quote",
@@ -3616,21 +3864,33 @@ dependencies = [
[[package]]
name = "sync_wrapper"
-version = "1.0.1"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
dependencies = [
"futures-core",
]
+[[package]]
+name = "synstructure"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "tempfile"
-version = "3.13.0"
+version = "3.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b"
+checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
dependencies = [
"cfg-if",
"fastrand",
+ "getrandom",
"once_cell",
"rustix",
"windows-sys 0.59.0",
@@ -3638,24 +3898,44 @@ dependencies = [
[[package]]
name = "termtree"
-version = "0.4.1"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
[[package]]
name = "thiserror"
-version = "1.0.64"
+version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84"
+checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
dependencies = [
- "thiserror-impl",
+ "thiserror-impl 2.0.11",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.64"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
dependencies = [
"proc-macro2",
"quote",
@@ -3675,9 +3955,9 @@ dependencies = [
[[package]]
name = "time"
-version = "0.3.36"
+version = "0.3.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21"
dependencies = [
"deranged",
"num-conv",
@@ -3695,9 +3975,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
-version = "0.2.18"
+version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de"
dependencies = [
"num-conv",
"time-core",
@@ -3712,11 +3992,21 @@ dependencies = [
"crunchy",
]
+[[package]]
+name = "tinystr"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
[[package]]
name = "tinyvec"
-version = "1.8.0"
+version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
+checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8"
dependencies = [
"tinyvec_macros",
]
@@ -3729,9 +4019,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
-version = "1.40.0"
+version = "1.43.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998"
+checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
dependencies = [
"backtrace",
"bytes",
@@ -3747,9 +4037,9 @@ dependencies = [
[[package]]
name = "tokio-macros"
-version = "2.4.0"
+version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
+checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
dependencies = [
"proc-macro2",
"quote",
@@ -3768,20 +4058,19 @@ dependencies = [
[[package]]
name = "tokio-rustls"
-version = "0.26.0"
+version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
+checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37"
dependencies = [
- "rustls 0.23.14",
- "rustls-pki-types",
+ "rustls 0.23.21",
"tokio",
]
[[package]]
name = "tokio-util"
-version = "0.7.12"
+version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a"
+checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078"
dependencies = [
"bytes",
"futures-core",
@@ -3807,6 +4096,27 @@ dependencies = [
"winnow",
]
+[[package]]
+name = "tower"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
[[package]]
name = "tower-service"
version = "0.3.3"
@@ -3815,9 +4125,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
-version = "0.1.40"
+version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
dependencies = [
"pin-project-lite",
"tracing-attributes",
@@ -3826,9 +4136,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
@@ -3837,9 +4147,9 @@ dependencies = [
[[package]]
name = "tracing-core"
-version = "0.1.32"
+version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
dependencies = [
"once_cell",
]
@@ -3862,18 +4172,18 @@ dependencies = [
[[package]]
name = "typed-builder"
-version = "0.16.2"
+version = "0.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34085c17941e36627a879208083e25d357243812c30e7d7387c3b954f30ade16"
+checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600"
dependencies = [
"typed-builder-macro",
]
[[package]]
name = "typed-builder-macro"
-version = "0.16.2"
+version = "0.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e"
+checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8"
dependencies = [
"proc-macro2",
"quote",
@@ -3886,26 +4196,11 @@ version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
-[[package]]
-name = "unicode-bidi"
-version = "0.3.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893"
-
[[package]]
name = "unicode-ident"
-version = "1.0.13"
+version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
-
-[[package]]
-name = "unicode-normalization"
-version = "0.1.24"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
-dependencies = [
- "tinyvec",
-]
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "unicode-segmentation"
@@ -3919,6 +4214,12 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
+[[package]]
+name = "unicode-width"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
+
[[package]]
name = "untrusted"
version = "0.9.0"
@@ -3927,9 +4228,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
-version = "2.5.2"
+version = "2.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
+checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
dependencies = [
"form_urlencoded",
"idna",
@@ -3942,6 +4243,18 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
+[[package]]
+name = "utf16_iter"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
[[package]]
name = "utf8parse"
version = "0.2.2"
@@ -3950,9 +4263,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid"
-version = "1.10.0"
+version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4"
dependencies = [
"getrandom",
"serde",
@@ -4006,24 +4319,24 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
-version = "0.2.93"
+version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5"
+checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
dependencies = [
"cfg-if",
"once_cell",
+ "rustversion",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
-version = "0.2.93"
+version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b"
+checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
dependencies = [
"bumpalo",
"log",
- "once_cell",
"proc-macro2",
"quote",
"syn",
@@ -4032,21 +4345,22 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
-version = "0.4.43"
+version = "0.4.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed"
+checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
dependencies = [
"cfg-if",
"js-sys",
+ "once_cell",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
-version = "0.2.93"
+version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf"
+checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@@ -4054,9 +4368,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
-version = "0.2.93"
+version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
+checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
dependencies = [
"proc-macro2",
"quote",
@@ -4067,15 +4381,18 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
-version = "0.2.93"
+version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484"
+checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
+dependencies = [
+ "unicode-ident",
+]
[[package]]
name = "wasm-streams"
-version = "0.4.1"
+version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e072d4e72f700fb3443d8fe94a39315df013eef1104903cdb0a2abd322bbecd"
+checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65"
dependencies = [
"futures-util",
"js-sys",
@@ -4086,9 +4403,19 @@ dependencies = [
[[package]]
name = "web-sys"
-version = "0.3.70"
+version = "0.3.77"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0"
+checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
@@ -4292,13 +4619,25 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
-version = "0.6.20"
+version = "0.6.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b"
+checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a"
dependencies = [
"memchr",
]
+[[package]]
+name = "write16"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
+
+[[package]]
+name = "writeable"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
+
[[package]]
name = "xmlparser"
version = "0.13.6"
@@ -4314,6 +4653,30 @@ dependencies = [
"lzma-sys",
]
+[[package]]
+name = "yoke"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
+dependencies = [
+ "serde",
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
[[package]]
name = "zerocopy"
version = "0.7.35"
@@ -4335,6 +4698,27 @@ dependencies = [
"syn",
]
+[[package]]
+name = "zerofrom"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
[[package]]
name = "zeroize"
version = "1.8.1"
@@ -4342,31 +4726,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
[[package]]
-name = "zstd"
-version = "0.12.4"
+name = "zerovec"
+version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
dependencies = [
- "zstd-safe 6.0.6",
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
]
[[package]]
-name = "zstd"
-version = "0.13.2"
+name = "zerovec-derive"
+version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
+checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
- "zstd-safe 7.2.1",
+ "proc-macro2",
+ "quote",
+ "syn",
]
[[package]]
-name = "zstd-safe"
-version = "6.0.6"
+name = "zstd"
+version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
dependencies = [
- "libc",
- "zstd-sys",
+ "zstd-safe",
]
[[package]]
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index b86dbd2a38027..b9d190ac07cc5 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -18,51 +18,69 @@
[package]
name = "datafusion-cli"
description = "Command Line Client for DataFusion query engine."
-version = "42.0.0"
+version = "44.0.0"
authors = ["Apache DataFusion "]
edition = "2021"
keywords = ["arrow", "datafusion", "query", "sql"]
license = "Apache-2.0"
homepage = "https://datafusion.apache.org"
repository = "https://github.com/apache/datafusion"
-# Specify MSRV here as `cargo msrv` doesn't support workspace version
-rust-version = "1.78"
+rust-version = "1.80.1"
readme = "README.md"
[dependencies]
-arrow = { version = "53.0.0" }
+arrow = { version = "54.0.0" }
async-trait = "0.1.73"
-aws-config = "1.5.5"
-aws-sdk-sso = "1.43.0"
-aws-sdk-ssooidc = "1.44.0"
-aws-sdk-sts = "1.43.0"
+## 1.5.13 requires a higher MSRV 1.81 so lock until DataFusion MSRV catches up
+aws-config = "=1.5.10"
+## 1.53.0 requires a higher MSRV 1.81 so lock until DataFusion MSRV catches up
+aws-sdk-sso = "=1.50.0"
+## 1.54.0 requires a higher MSRV 1.81 so lock until DataFusion MSRV catches up
+aws-sdk-ssooidc = "=1.51.0"
+## 1.54.1 requires a higher MSRV 1.81 so lock until DataFusion MSRV catches up
+aws-sdk-sts = "=1.51.0"
# end pin aws-sdk crates
aws-credential-types = "1.2.0"
clap = { version = "4.5.16", features = ["derive", "cargo"] }
-datafusion = { path = "../datafusion/core", version = "42.0.0", features = [
+datafusion = { path = "../datafusion/core", version = "44.0.0", features = [
"avro",
"crypto_expressions",
"datetime_expressions",
"encoding_expressions",
"parquet",
+ "recursive_protection",
"regex_expressions",
"unicode_expressions",
"compression",
] }
+datafusion-catalog = { path = "../datafusion/catalog", version = "44.0.0" }
dirs = "5.0.1"
env_logger = "0.11"
futures = "0.3"
+# pin as home 0.5.11 has MSRV 1.81. Can remove this once we bump MSRV to 1.81
+home = "=0.5.9"
mimalloc = { version = "0.1", default-features = false }
object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] }
parking_lot = { version = "0.12" }
-parquet = { version = "53.0.0", default-features = false }
+parquet = { version = "54.0.0", default-features = false }
regex = "1.8"
rustyline = "14.0"
tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] }
-url = "2.2"
+url = "2.5.4"
[dev-dependencies]
assert_cmd = "2.0"
-ctor = "0.2.0"
+ctor = "0.2.9"
predicates = "3.0"
rstest = "0.22"
+
+[profile.ci]
+inherits = "dev"
+incremental = false
+
+# The ci profile turns off debug info, etc. for dependencies to produce smaller binaries, making caching more effective
+[profile.ci.package."*"]
+debug = false
+debug-assertions = false
+strip = "debuginfo"
+incremental = false
diff --git a/datafusion-cli/Dockerfile b/datafusion-cli/Dockerfile
index 7adead64db57c..faf345660dbea 100644
--- a/datafusion-cli/Dockerfile
+++ b/datafusion-cli/Dockerfile
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-FROM rust:1.78-bookworm AS builder
+FROM rust:1.80-bookworm AS builder
COPY . /usr/src/datafusion
COPY ./datafusion /usr/src/datafusion/datafusion
diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md
index 73a2eb01b76ff..ce09c3b345b9b 100644
--- a/datafusion-cli/README.md
+++ b/datafusion-cli/README.md
@@ -41,6 +41,8 @@ The reason `datafusion-cli` is not part of the main workspace in
checked in `Cargo.lock` file to ensure reproducible builds.
However, the `datafusion` and sub crates are intended for use as libraries and
-thus do not have a `Cargo.lock` file checked in.
+thus do not have a `Cargo.lock` file checked in, as described in the [main
+README] file.
[`datafusion cargo.toml`]: https://github.com/apache/datafusion/blob/main/Cargo.toml
+[main readme]: ../README.md
diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index db4242d971758..a4f154b2de92d 100644
--- a/datafusion-cli/src/exec.rs
+++ b/datafusion-cli/src/exec.rs
@@ -33,11 +33,12 @@ use crate::{
};
use datafusion::common::instant::Instant;
-use datafusion::common::plan_datafusion_err;
+use datafusion::common::{plan_datafusion_err, plan_err};
use datafusion::config::ConfigFileType;
use datafusion::datasource::listing::ListingTableUrl;
use datafusion::error::{DataFusionError, Result};
use datafusion::logical_expr::{DdlStatement, LogicalPlan};
+use datafusion::physical_plan::execution_plan::EmissionType;
use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties};
use datafusion::sql::parser::{DFParser, Statement};
use datafusion::sql::sqlparser::dialect::dialect_from_str;
@@ -234,10 +235,19 @@ pub(super) async fn exec_and_print(
let df = ctx.execute_logical_plan(plan).await?;
let physical_plan = df.create_physical_plan().await?;
- if physical_plan.execution_mode().is_unbounded() {
+ if physical_plan.boundedness().is_unbounded() {
+ if physical_plan.pipeline_behavior() == EmissionType::Final {
+ return plan_err!(
+ "The given query can generate a valid result only once \
+ the source finishes, but the source is unbounded"
+ );
+ }
+ // As the input stream arrives, we can generate results incrementally.
+ // However, bounded memory usage is not guaranteed.
let stream = execute_stream(physical_plan, task_ctx.clone())?;
print_options.print_stream(stream, now).await?;
} else {
+ // Bounded stream; collected results are printed after all input consumed.
let schema = physical_plan.schema();
let results = collect(physical_plan, task_ctx.clone()).await?;
adjusted.into_inner().print_batches(schema, &results, now)?;
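
The hunk above distinguishes unbounded plans that can emit results incrementally from plans that only emit a final result. Below is a minimal sketch of the same decision using the APIs imported in this file (`boundedness`, `pipeline_behavior`, `EmissionType`); the function name, error text, and the unused bindings are illustrative, not the actual CLI code.

```rust
use std::sync::Arc;

use datafusion::common::plan_err;
use datafusion::error::Result;
use datafusion::execution::TaskContext;
use datafusion::physical_plan::execution_plan::EmissionType;
use datafusion::physical_plan::{
    collect, execute_stream, ExecutionPlan, ExecutionPlanProperties,
};

// Sketch: stream results when the plan can emit them incrementally,
// otherwise collect; reject unbounded plans that only emit a final result.
async fn run_plan(
    plan: Arc<dyn ExecutionPlan>,
    task_ctx: Arc<TaskContext>,
) -> Result<()> {
    if plan.boundedness().is_unbounded() {
        if plan.pipeline_behavior() == EmissionType::Final {
            // Such a plan would wait forever for the unbounded source to finish.
            return plan_err!("query over an unbounded source never produces output");
        }
        // Results can be produced as input arrives.
        let _stream = execute_stream(plan, task_ctx)?;
    } else {
        // Bounded input: buffering the full result set is fine.
        let _batches = collect(plan, task_ctx).await?;
    }
    Ok(())
}
```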
@@ -383,7 +393,7 @@ pub(crate) async fn register_object_store_and_config_extensions(
ctx.register_table_options_extension_from_scheme(scheme);
// Clone and modify the default table options based on the provided options
- let mut table_options = ctx.session_state().default_table_options().clone();
+ let mut table_options = ctx.session_state().default_table_options();
if let Some(format) = format {
table_options.set_config_format(format);
}
diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs
index 3b91abf8f3dcf..25d9b1681e516 100644
--- a/datafusion-cli/src/functions.rs
+++ b/datafusion-cli/src/functions.rs
@@ -24,13 +24,13 @@ use async_trait::async_trait;
use datafusion::catalog::Session;
use datafusion::common::{plan_err, Column};
-use datafusion::datasource::function::TableFunctionImpl;
use datafusion::datasource::TableProvider;
use datafusion::error::Result;
use datafusion::logical_expr::{Expr, Scalar};
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::scalar::ScalarValue;
+use datafusion_catalog::TableFunctionImpl;
use parquet::basic::ConvertedType;
use parquet::data_type::{ByteArray, FixedLenByteArray};
use parquet::file::reader::FileReader;
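
With `TableFunctionImpl` now imported from `datafusion_catalog`, a user-defined table function looks roughly like the sketch below. The struct name and the returned `MemTable` contents are made up for illustration; only the import path and trait shape follow the hunk above.

```rust
use std::sync::Arc;

use datafusion::arrow::array::Int64Array;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::{MemTable, TableProvider};
use datafusion::error::Result;
use datafusion::logical_expr::Expr;
use datafusion_catalog::TableFunctionImpl;

/// Hypothetical table function that ignores its arguments and returns a
/// one-column in-memory table.
#[derive(Debug)]
struct OnesFunc;

impl TableFunctionImpl for OnesFunc {
    fn call(&self, _args: &[Expr]) -> Result<Arc<dyn TableProvider>> {
        let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int64, false)]));
        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![Arc::new(Int64Array::from(vec![1, 1, 1]))],
        )?;
        Ok(Arc::new(MemTable::try_new(schema, vec![vec![batch]])?))
    }
}
```

Such a function would presumably be registered with `ctx.register_udtf("ones", Arc::new(OnesFunc))`, the same way the CLI wires up `ParquetMetadataFunc`.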
@@ -363,7 +363,7 @@ impl TableFunctionImpl for ParquetMetadataFunc {
Field::new("total_uncompressed_size", DataType::Int64, true),
]));
- // construct recordbatch from metadata
+ // construct record batch from metadata
let mut filename_arr = vec![];
let mut row_group_id_arr = vec![];
let mut row_group_num_rows_arr = vec![];
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index 4c6c352ff3395..52665df3751ea 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -19,12 +19,12 @@ use std::collections::HashMap;
use std::env;
use std::path::Path;
use std::process::ExitCode;
-use std::sync::{Arc, OnceLock};
+use std::sync::{Arc, LazyLock};
use datafusion::error::{DataFusionError, Result};
use datafusion::execution::context::SessionConfig;
-use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool};
-use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
+use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, MemoryPool};
+use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::prelude::SessionContext;
use datafusion_cli::catalog::DynamicObjectStoreCatalog;
use datafusion_cli::functions::ParquetMetadataFunc;
@@ -156,27 +156,22 @@ async fn main_inner() -> Result<()> {
session_config = session_config.with_batch_size(batch_size);
};
- let rt_config = RuntimeConfig::new();
- let rt_config =
- // set memory pool size
- if let Some(memory_limit) = args.memory_limit {
- // set memory pool type
- match args.mem_pool_type {
- PoolType::Fair => rt_config
- .with_memory_pool(Arc::new(FairSpillPool::new(memory_limit))),
- PoolType::Greedy => rt_config
- .with_memory_pool(Arc::new(GreedyMemoryPool::new(memory_limit)))
- }
- } else {
- rt_config
+ let mut rt_builder = RuntimeEnvBuilder::new();
+ // set memory pool size
+ if let Some(memory_limit) = args.memory_limit {
+ // set memory pool type
+ let pool: Arc<dyn MemoryPool> = match args.mem_pool_type {
+ PoolType::Fair => Arc::new(FairSpillPool::new(memory_limit)),
+ PoolType::Greedy => Arc::new(GreedyMemoryPool::new(memory_limit)),
};
+ rt_builder = rt_builder.with_memory_pool(pool)
+ }
- let runtime_env = create_runtime_env(rt_config.clone())?;
+ let runtime_env = rt_builder.build_arc()?;
// enable dynamic file query
- let ctx =
- SessionContext::new_with_config_rt(session_config.clone(), Arc::new(runtime_env))
- .enable_url_table();
+ let ctx = SessionContext::new_with_config_rt(session_config, runtime_env)
+ .enable_url_table();
ctx.refresh_catalogs().await?;
// install dynamic catalog provider that can register required object stores
ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new(
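
The `RuntimeConfig`/`create_runtime_env` pair is replaced by the `RuntimeEnvBuilder` API. A self-contained sketch of the same flow, with a hypothetical `use_fair_pool` flag standing in for the CLI's `--mem-pool-type` option:

```rust
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::execution::context::SessionConfig;
use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, MemoryPool};
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::prelude::SessionContext;

/// Sketch: build a SessionContext with an optional memory limit via the
/// builder API shown in the hunk above.
fn make_context(memory_limit: Option<usize>, use_fair_pool: bool) -> Result<SessionContext> {
    let mut rt_builder = RuntimeEnvBuilder::new();
    if let Some(limit) = memory_limit {
        let pool: Arc<dyn MemoryPool> = if use_fair_pool {
            Arc::new(FairSpillPool::new(limit))
        } else {
            Arc::new(GreedyMemoryPool::new(limit))
        };
        rt_builder = rt_builder.with_memory_pool(pool);
    }
    // build_arc() hands back an Arc<RuntimeEnv> ready for the context.
    let runtime_env = rt_builder.build_arc()?;
    Ok(SessionContext::new_with_config_rt(SessionConfig::new(), runtime_env))
}
```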
@@ -231,10 +226,6 @@ async fn main_inner() -> Result<()> {
Ok(())
}
-fn create_runtime_env(rn_config: RuntimeConfig) -> Result<RuntimeEnv> {
- RuntimeEnv::try_new(rn_config)
-}
-
fn parse_valid_file(dir: &str) -> Result {
if Path::new(dir).is_file() {
Ok(dir.to_string())
@@ -288,9 +279,8 @@ impl ByteUnit {
}
fn extract_memory_pool_size(size: &str) -> Result<usize, String> {
- fn byte_suffixes() -> &'static HashMap<&'static str, ByteUnit> {
- static BYTE_SUFFIXES: OnceLock<HashMap<&'static str, ByteUnit>> = OnceLock::new();
- BYTE_SUFFIXES.get_or_init(|| {
+ static BYTE_SUFFIXES: LazyLock<HashMap<&'static str, ByteUnit>> =
+ LazyLock::new(|| {
let mut m = HashMap::new();
m.insert("b", ByteUnit::Byte);
m.insert("k", ByteUnit::KiB);
@@ -302,23 +292,20 @@ fn extract_memory_pool_size(size: &str) -> Result {
m.insert("t", ByteUnit::TiB);
m.insert("tb", ByteUnit::TiB);
m
- })
- }
+ });
- fn suffix_re() -> &'static regex::Regex {
- static SUFFIX_REGEX: OnceLock<regex::Regex> = OnceLock::new();
- SUFFIX_REGEX.get_or_init(|| regex::Regex::new(r"^(-?[0-9]+)([a-z]+)?$").unwrap())
- }
+ static SUFFIX_REGEX: LazyLock<regex::Regex> =
+ LazyLock::new(|| regex::Regex::new(r"^(-?[0-9]+)([a-z]+)?$").unwrap());
let lower = size.to_lowercase();
- if let Some(caps) = suffix_re().captures(&lower) {
+ if let Some(caps) = SUFFIX_REGEX.captures(&lower) {
let num_str = caps.get(1).unwrap().as_str();
let num = num_str.parse::<usize>().map_err(|_| {
format!("Invalid numeric value in memory pool size '{}'", size)
})?;
let suffix = caps.get(2).map(|m| m.as_str()).unwrap_or("b");
- let unit = byte_suffixes()
+ let unit = &BYTE_SUFFIXES
.get(suffix)
.ok_or_else(|| format!("Invalid memory pool size '{}'", size))?;
let memory_pool_size = usize::try_from(unit.multiplier())
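
The `OnceLock` plus accessor-function pattern is replaced by `std::sync::LazyLock` statics (stable since Rust 1.80, matching the MSRV bump above). A generic, standalone sketch of the pattern, not the CLI's actual suffix table:

```rust
use std::collections::HashMap;
use std::sync::LazyLock;

// The initializer now lives next to the static; first access runs it exactly once.
static SUFFIXES: LazyLock<HashMap<&'static str, u64>> =
    LazyLock::new(|| HashMap::from([("b", 1), ("k", 1024), ("m", 1024 * 1024)]));

fn multiplier(suffix: &str) -> Option<u64> {
    // Dereferencing the LazyLock lazily initializes the map.
    SUFFIXES.get(suffix).copied()
}

fn main() {
    assert_eq!(multiplier("k"), Some(1024));
    assert_eq!(multiplier("x"), None);
}
```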
diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs
index e8d60e4f0926c..045c924e50370 100644
--- a/datafusion-cli/src/object_storage.rs
+++ b/datafusion-cli/src/object_storage.rs
@@ -32,7 +32,7 @@ use aws_credential_types::provider::ProvideCredentials;
use object_store::aws::{AmazonS3Builder, AwsCredential};
use object_store::gcp::GoogleCloudStorageBuilder;
use object_store::http::HttpBuilder;
-use object_store::{CredentialProvider, ObjectStore};
+use object_store::{ClientOptions, CredentialProvider, ObjectStore};
use url::Url;
pub async fn get_s3_object_store_builder(
@@ -437,6 +437,7 @@ pub(crate) async fn get_object_store(
}
"http" | "https" => Arc::new(
HttpBuilder::new()
+ .with_client_options(ClientOptions::new().with_allow_http(true))
.with_url(url.origin().ascii_serialization())
.build()?,
),
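
The added `ClientOptions` call lets the `http` scheme work against plain-HTTP endpoints, which `object_store` otherwise rejects. A small sketch of building such a store; the error handling and URL handling are simplified compared to the CLI:

```rust
use std::sync::Arc;

use object_store::http::HttpBuilder;
use object_store::{ClientOptions, ObjectStore};

/// Sketch: an HTTP-backed object store that also accepts plain `http://`
/// endpoints. The URL string passed by the caller is illustrative.
fn http_store(url: &str) -> object_store::Result<Arc<dyn ObjectStore>> {
    let store = HttpBuilder::new()
        // Without this option, object_store only talks to https endpoints.
        .with_client_options(ClientOptions::new().with_allow_http(true))
        .with_url(url)
        .build()?;
    Ok(Arc::new(store))
}
```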
@@ -471,12 +472,13 @@ mod tests {
#[tokio::test]
async fn s3_object_store_builder() -> Result<()> {
- let access_key_id = "fake_access_key_id";
- let secret_access_key = "fake_secret_access_key";
+ // "fake" is uppercase to ensure the values are not lowercased when parsed
+ let access_key_id = "FAKE_access_key_id";
+ let secret_access_key = "FAKE_secret_access_key";
let region = "fake_us-east-2";
let endpoint = "endpoint33";
- let session_token = "fake_session_token";
- let location = "s3://bucket/path/file.parquet";
+ let session_token = "FAKE_session_token";
+ let location = "s3://bucket/path/FAKE/file.parquet";
let table_url = ListingTableUrl::parse(location)?;
let scheme = table_url.scheme();
@@ -495,7 +497,7 @@ mod tests {
if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
ctx.register_table_options_extension_from_scheme(scheme);
- let mut table_options = ctx.state().default_table_options().clone();
+ let mut table_options = ctx.state().default_table_options();
table_options.alter_with_string_hash_map(&cmd.options)?;
let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
let builder =
@@ -540,7 +542,7 @@ mod tests {
if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
ctx.register_table_options_extension_from_scheme(scheme);
- let mut table_options = ctx.state().default_table_options().clone();
+ let mut table_options = ctx.state().default_table_options();
table_options.alter_with_string_hash_map(&cmd.options)?;
let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
let err = get_s3_object_store_builder(table_url.as_ref(), aws_options)
@@ -566,7 +568,7 @@ mod tests {
if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
ctx.register_table_options_extension_from_scheme(scheme);
- let mut table_options = ctx.state().default_table_options().clone();
+ let mut table_options = ctx.state().default_table_options();
table_options.alter_with_string_hash_map(&cmd.options)?;
let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
// ensure this isn't an error
@@ -594,7 +596,7 @@ mod tests {
if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
ctx.register_table_options_extension_from_scheme(scheme);
- let mut table_options = ctx.state().default_table_options().clone();
+ let mut table_options = ctx.state().default_table_options();
table_options.alter_with_string_hash_map(&cmd.options)?;
let aws_options = table_options.extensions.get::<AwsOptions>().unwrap();
let builder = get_oss_object_store_builder(table_url.as_ref(), aws_options)?;
@@ -631,7 +633,7 @@ mod tests {
if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan {
ctx.register_table_options_extension_from_scheme(scheme);
- let mut table_options = ctx.state().default_table_options().clone();
+ let mut table_options = ctx.state().default_table_options();
table_options.alter_with_string_hash_map(&cmd.options)?;
let gcp_options = table_options.extensions.get::<GcpOptions>().unwrap();
let builder = get_gcs_object_store_builder(table_url.as_ref(), gcp_options)?;
diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs
index 92cb106d622bf..1fc949593512b 100644
--- a/datafusion-cli/src/print_format.rs
+++ b/datafusion-cli/src/print_format.rs
@@ -26,7 +26,7 @@ use arrow::datatypes::SchemaRef;
use arrow::json::{ArrayWriter, LineDelimitedWriter};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::pretty_format_batches_with_options;
-use datafusion::common::format::DEFAULT_FORMAT_OPTIONS;
+use datafusion::common::format::DEFAULT_CLI_FORMAT_OPTIONS;
use datafusion::error::Result;
/// Allow records to be printed in different formats
@@ -133,7 +133,7 @@ fn format_batches_with_maxrows(
let formatted = pretty_format_batches_with_options(
&filtered_batches,
- &DEFAULT_FORMAT_OPTIONS,
+ &DEFAULT_CLI_FORMAT_OPTIONS,
)?;
if over_limit {
let mut formatted_str = format!("{}", formatted);
@@ -145,7 +145,7 @@ fn format_batches_with_maxrows(
}
MaxRows::Unlimited => {
let formatted =
- pretty_format_batches_with_options(batches, &DEFAULT_FORMAT_OPTIONS)?;
+ pretty_format_batches_with_options(batches, &DEFAULT_CLI_FORMAT_OPTIONS)?;
writeln!(writer, "{}", formatted)?;
}
}
@@ -201,7 +201,7 @@ impl PrintFormat {
let empty_batch = RecordBatch::new_empty(schema);
let formatted = pretty_format_batches_with_options(
&[empty_batch],
- &DEFAULT_FORMAT_OPTIONS,
+ &DEFAULT_CLI_FORMAT_OPTIONS,
)?;
writeln!(writer, "{}", formatted)?;
}
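
The CLI now renders batches with `DEFAULT_CLI_FORMAT_OPTIONS` instead of `DEFAULT_FORMAT_OPTIONS`. A minimal sketch of formatting a hand-built batch with it, assuming the constant is exported from `datafusion::common::format` as the imports above imply:

```rust
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::pretty_format_batches_with_options;
use datafusion::common::format::DEFAULT_CLI_FORMAT_OPTIONS;
use datafusion::error::Result;

fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch =
        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))])?;
    // Same call the CLI makes, just against an illustrative batch.
    let table = pretty_format_batches_with_options(&[batch], &DEFAULT_CLI_FORMAT_OPTIONS)?;
    println!("{table}");
    Ok(())
}
```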
diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml
index f430a87e190db..d8aaad801e5c0 100644
--- a/datafusion-examples/Cargo.toml
+++ b/datafusion-examples/Cargo.toml
@@ -60,8 +60,10 @@ async-trait = { workspace = true }
bytes = { workspace = true }
dashmap = { workspace = true }
datafusion = { workspace = true, default-features = true, features = ["avro"] }
+datafusion-catalog = { workspace = true }
datafusion-common = { workspace = true, default-features = true }
datafusion-expr = { workspace = true }
+datafusion-functions-window-common = { workspace = true }
datafusion-optimizer = { workspace = true, default-features = true }
datafusion-physical-expr = { workspace = true, default-features = true }
datafusion-proto = { workspace = true }
@@ -70,12 +72,8 @@ env_logger = { workspace = true }
futures = { workspace = true }
log = { workspace = true }
mimalloc = { version = "0.1", default-features = false }
-num_cpus = { workspace = true }
object_store = { workspace = true, features = ["aws", "http"] }
prost = { workspace = true }
-prost-derive = { workspace = true }
-serde = { version = "1.0.136", features = ["derive"] }
-serde_json = { workspace = true }
tempfile = { workspace = true }
test-utils = { path = "../test-utils" }
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index 5f032c3e9cfff..b5f82b4d5140e 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -22,7 +22,7 @@
This crate includes end to end, highly commented examples of how to use
various DataFusion APIs to help you get started.
-## Prerequisites:
+## Prerequisites
Run `git submodule update --init` to init test files.
@@ -54,22 +54,19 @@ cargo run --example dataframe
- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
- [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization
- [`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file
+- [`csv_json_opener.rs`](examples/csv_json_opener.rs): Use low level `FileOpener` APIs to read CSV/JSON into Arrow `RecordBatch`es
- [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider)
- [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format
- [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3
-- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame against a local parquet file
-- [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory
-- [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods which write data out from a DataFrame
-- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results into rust structs using serde
-- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze `Expr`s
+- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file.
+- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs
+- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s
- [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks.
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
-- [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es
- [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom OptimizerRule to replace certain predicates
- [`parquet_index.rs`](examples/parquet_index.rs): Create an secondary index over several parquet files and use it to speed up queries
-- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution
- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`.
- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan`
@@ -78,12 +75,14 @@ cargo run --example dataframe
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
+- [`remote_catalog.rs`](examples/remote_catalog.rs): Examples of interfacing with a remote catalog (e.g. over a network)
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
- [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with DataFusion structures
- [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only) from sql strings
- [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a custom SQL dialect on top of `DFParser`
+- [`sql_query.rs`](examples/sql_query.rs): Query data using SQL (in memory `RecordBatch`es, local Parquet files)
- [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions
diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs
index f6860bb5b87a5..28a3a2f1de09e 100644
--- a/datafusion-examples/examples/advanced_parquet_index.rs
+++ b/datafusion-examples/examples/advanced_parquet_index.rs
@@ -82,7 +82,7 @@ use url::Url;
/// Specifically, this example illustrates how to:
/// 1. Use [`ParquetFileReaderFactory`] to avoid re-reading parquet metadata on each query
/// 2. Use [`PruningPredicate`] for predicate analysis
-/// 3. Pass a row group selection to [`ParuetExec`]
+/// 3. Pass a row group selection to [`ParquetExec`]
/// 4. Pass a row selection (within a row group) to [`ParquetExec`]
///
/// Note this is a *VERY* low level example for people who want to build their
@@ -211,7 +211,7 @@ async fn main() -> Result<()> {
//
// Note: in order to prune pages, the Page Index must be loaded and the
// ParquetExec will load it on demand if not present. To avoid a second IO
- // during query, this example loaded the Page Index pre-emptively by setting
+ // during query, this example loaded the Page Index preemptively by setting
// `ArrowReader::with_page_index` in `IndexedFile::try_new`
provider.set_use_row_selection(true);
println!("** Select data, predicate `id = 950`");
@@ -229,9 +229,9 @@ async fn main() -> Result<()> {
/// `file1.parquet` contains values `0..1000`
#[derive(Debug)]
pub struct IndexTableProvider {
- /// Where the file is stored (cleanup on drop)
- #[allow(dead_code)]
- tmpdir: TempDir,
+ /// Pointer to temporary file storage. Kept in scope to prevent the temporary
+ /// folder from being deleted prematurely
+ _tmpdir: TempDir,
/// The file that is being read.
indexed_file: IndexedFile,
/// The underlying object store
@@ -250,7 +250,7 @@ impl IndexTableProvider {
Ok(Self {
indexed_file,
- tmpdir,
+ _tmpdir: tmpdir,
object_store,
use_row_selections: AtomicBool::new(false),
})
diff --git a/datafusion-examples/examples/advanced_udaf.rs b/datafusion-examples/examples/advanced_udaf.rs
index 1259f90d64496..a914cea4a928a 100644
--- a/datafusion-examples/examples/advanced_udaf.rs
+++ b/datafusion-examples/examples/advanced_udaf.rs
@@ -31,7 +31,9 @@ use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion_common::{cast::as_float64_array, ScalarValue};
use datafusion_expr::{
- function::{AccumulatorArgs, StateFieldsArgs},
+ expr::AggregateFunction,
+ function::{AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs},
+ simplify::SimplifyInfo,
Accumulator, AggregateUDF, AggregateUDFImpl, GroupsAccumulator, Signature,
};
@@ -193,44 +195,10 @@ impl Accumulator for GeometricMean {
}
fn size(&self) -> usize {
- std::mem::size_of_val(self)
+ size_of_val(self)
}
}
-// create local session context with an in-memory table
-fn create_context() -> Result<SessionContext> {
- use datafusion::datasource::MemTable;
- // define a schema.
- let schema = Arc::new(Schema::new(vec![
- Field::new("a", DataType::Float32, false),
- Field::new("b", DataType::Float32, false),
- ]));
-
- // define data in two partitions
- let batch1 = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Float32Array::from(vec![2.0, 4.0, 8.0])),
- Arc::new(Float32Array::from(vec![2.0, 2.0, 2.0])),
- ],
- )?;
- let batch2 = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Float32Array::from(vec![64.0])),
- Arc::new(Float32Array::from(vec![2.0])),
- ],
- )?;
-
- // declare a new context. In spark API, this corresponds to a new spark SQLsession
- let ctx = SessionContext::new();
-
- // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
- let provider = MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?;
- ctx.register_table("t", Arc::new(provider))?;
- Ok(ctx)
-}
-
// Define a `GroupsAccumulator` for GeometricMean
/// which handles accumulator state for multiple groups at once.
/// This API is significantly more complicated than `Accumulator`, which manages
@@ -394,40 +362,151 @@ impl GroupsAccumulator for GeometricMeanGroupsAccumulator {
}
fn size(&self) -> usize {
- self.counts.capacity() * std::mem::size_of::()
- + self.prods.capacity() * std::mem::size_of::()
+ self.counts.capacity() * size_of::()
+ + self.prods.capacity() * size_of::()
+ }
+}
+
+/// This example shows how to use the AggregateUDFImpl::simplify API to simplify/replace user
+/// defined aggregate function with a different expression which is defined in the `simplify` method.
+#[derive(Debug, Clone)]
+struct SimplifiedGeoMeanUdaf {
+ signature: Signature,
+}
+
+impl SimplifiedGeoMeanUdaf {
+ fn new() -> Self {
+ Self {
+ signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
+ }
+ }
+}
+
+impl AggregateUDFImpl for SimplifiedGeoMeanUdaf {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "simplified_geo_mean"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ Ok(DataType::Float64)
+ }
+
+ fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+ unimplemented!("should not be invoked")
+ }
+
+ fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<Field>> {
+ unimplemented!("should not be invoked")
+ }
+
+ fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
+ true
+ }
+
+ fn create_groups_accumulator(
+ &self,
+ _args: AccumulatorArgs,
+ ) -> Result<Box<dyn GroupsAccumulator>> {
+ unimplemented!("should not get here");
+ }
+
+ /// Optionally replaces a UDAF with another expression during query optimization.
+ fn simplify(&self) -> Option<AggregateFunctionSimplification> {
+ let simplify = |aggregate_function: AggregateFunction, _: &dyn SimplifyInfo| {
+ // Replaces the UDAF with `GeoMeanUdaf` as a placeholder example to demonstrate the `simplify` method.
+ // In real-world scenarios, you might create UDFs from built-in expressions.
+ Ok(Expr::AggregateFunction(AggregateFunction::new_udf(
+ Arc::new(AggregateUDF::from(GeoMeanUdaf::new())),
+ aggregate_function.args,
+ aggregate_function.distinct,
+ aggregate_function.filter,
+ aggregate_function.order_by,
+ aggregate_function.null_treatment,
+ )))
+ };
+ Some(Box::new(simplify))
}
}
+// create local session context with an in-memory table
+fn create_context() -> Result<SessionContext> {
+ use datafusion::datasource::MemTable;
+ // define a schema.
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("a", DataType::Float32, false),
+ Field::new("b", DataType::Float32, false),
+ ]));
+
+ // define data in two partitions
+ let batch1 = RecordBatch::try_new(
+ schema.clone(),
+ vec![
+ Arc::new(Float32Array::from(vec![2.0, 4.0, 8.0])),
+ Arc::new(Float32Array::from(vec![2.0, 2.0, 2.0])),
+ ],
+ )?;
+ let batch2 = RecordBatch::try_new(
+ schema.clone(),
+ vec![
+ Arc::new(Float32Array::from(vec![64.0])),
+ Arc::new(Float32Array::from(vec![2.0])),
+ ],
+ )?;
+
+ // declare a new context. In the Spark API, this corresponds to a new Spark SQL session
+ let ctx = SessionContext::new();
+
+ // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
+ let provider = MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?;
+ ctx.register_table("t", Arc::new(provider))?;
+ Ok(ctx)
+}
+
#[tokio::main]
async fn main() -> Result<()> {
let ctx = create_context()?;
- // create the AggregateUDF
- let geometric_mean = AggregateUDF::from(GeoMeanUdaf::new());
- ctx.register_udaf(geometric_mean.clone());
+ let geo_mean_udf = AggregateUDF::from(GeoMeanUdaf::new());
+ let simplified_geo_mean_udf = AggregateUDF::from(SimplifiedGeoMeanUdaf::new());
+
+ for (udf, udf_name) in [
+ (geo_mean_udf, "geo_mean"),
+ (simplified_geo_mean_udf, "simplified_geo_mean"),
+ ] {
+ ctx.register_udaf(udf.clone());
- let sql_df = ctx.sql("SELECT geo_mean(a) FROM t group by b").await?;
- sql_df.show().await?;
+ let sql_df = ctx
+ .sql(&format!("SELECT {}(a) FROM t GROUP BY b", udf_name))
+ .await?;
+ sql_df.show().await?;
- // get a DataFrame from the context
- // this table has 1 column `a` f32 with values {2,4,8,64}, whose geometric mean is 8.0.
- let df = ctx.table("t").await?;
+ // get a DataFrame from the context
+ // this table has 1 column `a` f32 with values {2,4,8,64}, whose geometric mean is 8.0.
+ let df = ctx.table("t").await?;
- // perform the aggregation
- let df = df.aggregate(vec![], vec![geometric_mean.call(vec![col("a")])])?;
+ // perform the aggregation
+ let df = df.aggregate(vec![], vec![udf.call(vec![col("a")])])?;
- // note that "a" is f32, not f64. DataFusion coerces it to match the UDAF's signature.
+ // note that "a" is f32, not f64. DataFusion coerces it to match the UDAF's signature.
- // execute the query
- let results = df.collect().await?;
+ // execute the query
+ let results = df.collect().await?;
- // downcast the array to the expected type
- let result = as_float64_array(results[0].column(0))?;
+ // downcast the array to the expected type
+ let result = as_float64_array(results[0].column(0))?;
- // verify that the calculation is correct
- assert!((result.value(0) - 8.0).abs() < f64::EPSILON);
- println!("The geometric mean of [2,4,8,64] is {}", result.value(0));
+ // verify that the calculation is correct
+ assert!((result.value(0) - 8.0).abs() < f64::EPSILON);
+ println!("The geometric mean of [2,4,8,64] is {}", result.value(0));
+ }
Ok(())
}
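As a quick sanity check on the simplify path, EXPLAIN can be used to confirm that the optimizer replaces the call with the expression returned by `AggregateUDFImpl::simplify`. A minimal sketch, assuming the session from `main` above (the exact plan text depends on the DataFusion version):

// Hedged sketch: the optimized plan should show `geo_mean(a)` where the
// query said `simplified_geo_mean(a)`, because the closure returned from
// simplify() rewrites the aggregate during optimization.
let explain = ctx
    .sql("EXPLAIN SELECT simplified_geo_mean(a) FROM t GROUP BY b")
    .await?;
explain.show().await?;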
diff --git a/datafusion-examples/examples/advanced_udf.rs b/datafusion-examples/examples/advanced_udf.rs
index 22d37043e4731..0aa2b3f370e97 100644
--- a/datafusion-examples/examples/advanced_udf.rs
+++ b/datafusion-examples/examples/advanced_udf.rs
@@ -27,9 +27,11 @@ use arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::logical_expr::Volatility;
use datafusion::prelude::*;
-use datafusion_common::{internal_err, ScalarValue};
+use datafusion_common::{exec_err, internal_err, ScalarValue};
use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
-use datafusion_expr::{ColumnarValue, ScalarUDF, ScalarUDFImpl, Signature};
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
+};
/// This example shows how to use the full ScalarUDFImpl API to implement a user
/// defined function. As in the `simple_udf.rs` example, this struct implements
@@ -83,21 +85,29 @@ impl ScalarUDFImpl for PowUdf {
Ok(DataType::Float64)
}
- /// This is the function that actually calculates the results.
+ /// This function actually calculates the results of the scalar function.
+ ///
+ /// This is the same way that functions provided with DataFusion are invoked,
+ /// which permits important special cases:
///
- /// This is the same way that functions built into DataFusion are invoked,
- /// which permits important special cases when one or both of the arguments
- /// are single values (constants). For example `pow(a, 2)`
+ /// 1. When one or both of the arguments are single values (constants).
+ /// For example `pow(a, 2)`
+ /// 2. When the input arrays can be reused (avoid allocating a new output array)
///
/// However, it also means the implementation is more complex than when
/// using `create_udf`.
- fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+ // The other fields of the `args` struct are used for more specialized
+ // uses, and are not needed in this example
+ let ScalarFunctionArgs { mut args, .. } = args;
// DataFusion has arranged for the correct inputs to be passed to this
// function, but we check again to make sure
assert_eq!(args.len(), 2);
- let (base, exp) = (&args[0], &args[1]);
- assert_eq!(base.data_type(), &DataType::Float64);
- assert_eq!(exp.data_type(), &DataType::Float64);
+ // take ownership of arguments by popping in reverse order
+ let exp = args.pop().unwrap();
+ let base = args.pop().unwrap();
+ assert_eq!(*base.data_type(), DataType::Float64);
+ assert_eq!(*exp.data_type(), DataType::Float64);
match (base, exp) {
// For demonstration purposes we also implement the scalar / scalar
@@ -109,21 +119,17 @@ impl ScalarUDFImpl for PowUdf {
// this path once during planning, and simply use the result during
// execution.
(ColumnarValue::Scalar(base), ColumnarValue::Scalar(exp)) => {
- match (base.value(), exp.value()) {
- (ScalarValue::Float64(base), ScalarValue::Float64(exp)) => {
- // compute the output. Note DataFusion treats `None` as NULL.
- let res = match (base, exp) {
- (Some(base), Some(exp)) => Some(base.powf(*exp)),
- // one or both arguments were NULL
- _ => None,
- };
- Ok(ColumnarValue::from(ScalarValue::from(res)))
- }
- _ => {
- internal_err!("Invalid argument types to pow function")
- }
- }
+ let res = match (base.value(), exp.value()) {
+ // compute the output. Note DataFusion treats `None` as NULL.
+ (
+ ScalarValue::Float64(Some(base)),
+ ScalarValue::Float64(Some(exp)),
+ ) => Some(base.powf(*exp)),
+ _ => None, // one or both arguments were NULL
+ };
+ Ok(ColumnarValue::from(ScalarValue::from(res)))
}
+
// special case if the exponent is a constant
(ColumnarValue::Array(base_array), ColumnarValue::Scalar(exp)) => {
let result_array = match exp.value() {
@@ -147,24 +153,28 @@ impl ScalarUDFImpl for PowUdf {
Ok(ColumnarValue::Array(result_array))
}
- // special case if the base is a constant (note this code is quite
- // similar to the previous case, so we omit comments)
+ // special case if the base is a constant.
+ //
+ // Note this case is very similar to the previous case, so we could
+ // use the same pattern. However, for this case we demonstrate an
+ // even more advanced pattern to potentially avoid allocating a new array
(ColumnarValue::Scalar(base), ColumnarValue::Array(exp_array)) => {
let res = match base.value() {
ScalarValue::Float64(None) => {
new_null_array(exp_array.data_type(), exp_array.len())
}
ScalarValue::Float64(Some(base)) => {
- let exp_array = exp_array.as_primitive::<Float64Type>();
- let res: Float64Array =
- compute::unary(exp_array, |exp| base.powf(exp));
- Arc::new(res)
+ maybe_pow_in_place(*base, exp_array)?
}
- _ => return internal_err!("Invalid argument types to pow function"),
+ _ => return internal_err!("Invalid scalar argument to pow function"),
};
Ok(ColumnarValue::Array(res))
}
- // Both arguments are arrays so we have to perform the calculation for every row
+ // Both arguments are arrays so we have to perform the calculation
+ // for every row
+ //
+ // Note this could also be done in place using `binary_mut` as
+ // is done in `maybe_pow_in_place` but here we use binary for simplicity
(ColumnarValue::Array(base_array), ColumnarValue::Array(exp_array)) => {
let res: Float64Array = compute::binary(
base_array.as_primitive::<Float64Type>(),
@@ -187,6 +197,52 @@ impl ScalarUDFImpl for PowUdf {
}
}
+/// Evaluate `base ^ exp` *without* allocating a new array, if possible
+fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result<ArrayRef> {
+ // Calling `unary` creates a new array for the results. Avoiding
+ // allocations is a common optimization in performance critical code.
+ // arrow-rs allows this optimization via the `unary_mut`
+ // and `binary_mut` kernels in certain cases
+ //
+ // These kernels can only be used if there are no other references to
+ // the arrays (exp_array has to be the last remaining reference).
+ let owned_array = exp_array
+ // as in the previous example, we first downcast to &Float64Array
+ .as_primitive::<Float64Type>()
+ // non-obviously, we call clone here to get an owned `Float64Array`.
+ // Calling clone() is relatively inexpensive as it increments
+ // some ref counts but doesn't clone the data
+ //
+ // Once we have the owned Float64Array we can drop the original
+ // exp_array (untyped) reference
+ .clone();
+
+ // We *MUST* drop the reference to `exp_array` explicitly so that
+ // owned_array is the only reference remaining in this function.
+ //
+ // Note that depending on the query there may still be other references
+ // to the underlying buffers, which would prevent reuse. The only way to
+ // know for sure is the result of `compute::unary_mut`
+ drop(exp_array);
+
+ // If we have the only reference, compute the result directly into the same
+ // allocation as was used for the input array
+ match compute::unary_mut(owned_array, |exp| base.powf(exp)) {
+ Err(_orig_array) => {
+ // unary_mut will return the original array if there are other
+ // references into the underlying buffer (and thus reuse is
+ // impossible)
+ //
+ // In a real implementation, this case should fall back to
+ // calling `unary` and allocate a new array; In this example
+ // we will return an error for demonstration purposes
+ exec_err!("Could not reuse array for maybe_pow_in_place")
+ }
+ // a result of OK means the operation was run successfully
+ Ok(res) => Ok(Arc::new(res)),
+ }
+}
+
/// In this example we register `PowUdf` as a user defined function
/// and invoke it via the DataFrame API and SQL
#[tokio::main]
@@ -211,9 +267,29 @@ async fn main() -> Result<()> {
// print the results
df.show().await?;
- // You can also invoke both pow(2, 10) and its alias my_pow(a, b) using SQL
- let sql_df = ctx.sql("SELECT pow(2, 10), my_pow(a, b) FROM t").await?;
- sql_df.show().await?;
+ // You can also invoke both pow(2, 10) and its alias my_pow(a, b) using SQL
+ ctx.sql("SELECT pow(2, 10), my_pow(a, b) FROM t")
+ .await?
+ .show()
+ .await?;
+
+ // You can also exercise `maybe_pow_in_place` by passing a constant base and a
+ // column `a` as the exponent. If there is only a single
+ // reference to `a`, the code works well
+ ctx.sql("SELECT pow(2, a) FROM t").await?.show().await?;
+
+ // However, if there are multiple references to `a` in the evaluation
+ // the array storage can not be reused
+ let err = ctx
+ .sql("SELECT pow(2, a), pow(3, a) FROM t")
+ .await?
+ .show()
+ .await
+ .unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Execution error: Could not reuse array for maybe_pow_in_place"
+ );
Ok(())
}
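The comments in `maybe_pow_in_place` note that a production implementation would fall back to the allocating `unary` kernel when the buffer cannot be reused, rather than returning an error. A minimal sketch of that fallback, written as a hypothetical helper with the same imports as the example:

use std::sync::Arc;

use arrow::array::{ArrayRef, AsArray, Float64Array};
use arrow::compute;
use arrow::datatypes::Float64Type;
use datafusion::error::Result;

/// Hedged sketch (hypothetical helper, not in the example above): compute
/// `base ^ exp`, reusing the input buffer when possible and otherwise
/// allocating a new output array instead of erroring.
fn pow_in_place_or_copy(base: f64, exp_array: ArrayRef) -> Result<ArrayRef> {
    // cheap clone: only reference counts are incremented
    let owned = exp_array.as_primitive::<Float64Type>().clone();
    // drop the untyped reference so `owned` may be the sole owner
    drop(exp_array);
    match compute::unary_mut(owned, |exp| base.powf(exp)) {
        // the buffer could be reused: no new allocation
        Ok(res) => Ok(Arc::new(res) as ArrayRef),
        // other references exist: fall back to the allocating kernel
        Err(orig) => {
            let res: Float64Array = compute::unary(&orig, |exp| base.powf(exp));
            Ok(Arc::new(res) as ArrayRef)
        }
    }
}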
diff --git a/datafusion-examples/examples/advanced_udwf.rs b/datafusion-examples/examples/advanced_udwf.rs
index fd1b84070cf68..49e890467d21e 100644
--- a/datafusion-examples/examples/advanced_udwf.rs
+++ b/datafusion-examples/examples/advanced_udwf.rs
@@ -24,12 +24,16 @@ use arrow::{
};
use arrow_schema::Field;
use datafusion::error::Result;
+use datafusion::functions_aggregate::average::avg_udaf;
use datafusion::prelude::*;
use datafusion_common::ScalarValue;
-use datafusion_expr::function::WindowUDFFieldArgs;
+use datafusion_expr::expr::WindowFunction;
+use datafusion_expr::function::{WindowFunctionSimplification, WindowUDFFieldArgs};
+use datafusion_expr::simplify::SimplifyInfo;
use datafusion_expr::{
- PartitionEvaluator, Signature, WindowFrame, WindowUDF, WindowUDFImpl,
+ Expr, PartitionEvaluator, Signature, WindowFrame, WindowUDF, WindowUDFImpl,
};
+use datafusion_functions_window_common::partition::PartitionEvaluatorArgs;
/// This example shows how to use the full WindowUDFImpl API to implement a user
/// defined window function. As in the `simple_udwf.rs` example, this struct implements
@@ -74,7 +78,10 @@ impl WindowUDFImpl for SmoothItUdf {
/// Create a `PartitionEvaluator` to evaluate this function on a new
/// partition.
- fn partition_evaluator(&self) -> Result<Box<dyn PartitionEvaluator>> {
+ fn partition_evaluator(
+ &self,
+ _partition_evaluator_args: PartitionEvaluatorArgs,
+ ) -> Result<Box<dyn PartitionEvaluator>> {
Ok(Box::new(MyPartitionEvaluator::new()))
}
@@ -138,6 +145,67 @@ impl PartitionEvaluator for MyPartitionEvaluator {
}
}
+/// This UDWF will show how to use the WindowUDFImpl::simplify() API
+#[derive(Debug, Clone)]
+struct SimplifySmoothItUdf {
+ signature: Signature,
+}
+
+impl SimplifySmoothItUdf {
+ fn new() -> Self {
+ Self {
+ signature: Signature::exact(
+ // this function will always take one argument of type f64
+ vec![DataType::Float64],
+ // this function is deterministic and will always return the same
+ // result for the same input
+ Volatility::Immutable,
+ ),
+ }
+ }
+}
+impl WindowUDFImpl for SimplifySmoothItUdf {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "simplify_smooth_it"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn partition_evaluator(
+ &self,
+ _partition_evaluator_args: PartitionEvaluatorArgs,
+ ) -> Result<Box<dyn PartitionEvaluator>> {
+ todo!()
+ }
+
+ /// this function simplifies `SimplifySmoothItUdf` to a call of the `avg` `AggregateUDF`;
+ /// the default `partition_evaluator` implementation is therefore never called (left as `todo!()`)
+ fn simplify(&self) -> Option<WindowFunctionSimplification> {
+ let simplify = |window_function: WindowFunction, _: &dyn SimplifyInfo| {
+ Ok(Expr::WindowFunction(WindowFunction {
+ fun: datafusion_expr::WindowFunctionDefinition::AggregateUDF(avg_udaf()),
+ args: window_function.args,
+ partition_by: window_function.partition_by,
+ order_by: window_function.order_by,
+ window_frame: window_function.window_frame,
+ null_treatment: window_function.null_treatment,
+ }))
+ };
+
+ Some(Box::new(simplify))
+ }
+
+ fn field(&self, field_args: WindowUDFFieldArgs) -> Result<Field> {
+ Ok(Field::new(field_args.name(), DataType::Float64, true))
+ }
+}
+
// create local execution context with `cars.csv` registered as a table named `cars`
async fn create_context() -> Result {
// declare a new context. In spark API, this corresponds to a new spark SQL session
@@ -158,12 +226,15 @@ async fn main() -> Result<()> {
let smooth_it = WindowUDF::from(SmoothItUdf::new());
ctx.register_udwf(smooth_it.clone());
- // Use SQL to run the new window function
+ let simplify_smooth_it = WindowUDF::from(SimplifySmoothItUdf::new());
+ ctx.register_udwf(simplify_smooth_it.clone());
+
+ // Use SQL to retrieve entire table
let df = ctx.sql("SELECT * from cars").await?;
// print the results
df.show().await?;
- // Use SQL to run the new window function:
+ // Use SQL to run smooth_it:
//
// `PARTITION BY car`:each distinct value of car (red, and green)
// should be treated as a separate partition (and will result in
@@ -197,7 +268,7 @@ async fn main() -> Result<()> {
// print the results
df.show().await?;
- // this time, call the new widow function with an explicit
+ // this time, call the function with an explicit
// window so evaluate will be invoked with each window.
//
// `ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING`: each invocation
@@ -228,5 +299,22 @@ async fn main() -> Result<()> {
// print the results
df.show().await?;
+ // Use SQL to run simplify_smooth_it
+ let df = ctx
+ .sql(
+ "SELECT \
+ car, \
+ speed, \
+ simplify_smooth_it(speed) OVER (PARTITION BY car ORDER BY time) AS smooth_speed,\
+ time \
+ from cars \
+ ORDER BY \
+ car",
+ )
+ .await?;
+
+ // print the results
+ df.show().await?;
+
Ok(())
}
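Because `SimplifySmoothItUdf::simplify` rewrites the window call into the built-in `avg` aggregate, running `avg` directly over the same window should produce the same `smooth_speed` values. A hedged sketch of the equivalent query (not part of the example):

// Equivalent query using avg() directly; after optimization,
// simplify_smooth_it(speed) should evaluate to the same expression.
ctx.sql(
    "SELECT car, speed, \
     avg(speed) OVER (PARTITION BY car ORDER BY time) AS smooth_speed, \
     time \
     FROM cars ORDER BY car",
)
.await?
.show()
.await?;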
diff --git a/datafusion-examples/examples/analyzer_rule.rs b/datafusion-examples/examples/analyzer_rule.rs
index bd067be97b8b3..aded64ed4105d 100644
--- a/datafusion-examples/examples/analyzer_rule.rs
+++ b/datafusion-examples/examples/analyzer_rule.rs
@@ -138,7 +138,7 @@ impl AnalyzerRule for RowLevelAccessControl {
fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result {
// use the TreeNode API to recursively walk the LogicalPlan tree
// and all of its children (inputs)
- let transfomed_plan = plan.transform(|plan| {
+ let transformed_plan = plan.transform(|plan| {
// This closure is called for each LogicalPlan node
// if it is a Scan node, add a filter to remove all managers
if is_employee_table_scan(&plan) {
@@ -166,7 +166,7 @@ impl AnalyzerRule for RowLevelAccessControl {
//
// This example does not need the value of either flag, so simply
// extract the LogicalPlan "data"
- Ok(transfomed_plan.data)
+ Ok(transformed_plan.data)
}
fn name(&self) -> &str {
diff --git a/datafusion-examples/examples/catalog.rs b/datafusion-examples/examples/catalog.rs
index f40f1dfb5a159..655438b78b9fa 100644
--- a/datafusion-examples/examples/catalog.rs
+++ b/datafusion-examples/examples/catalog.rs
@@ -46,11 +46,11 @@ async fn main() -> Result<()> {
let ctx = SessionContext::new();
let state = ctx.state();
- let cataloglist = Arc::new(CustomCatalogProviderList::new());
+ let catalog_list = Arc::new(CustomCatalogProviderList::new());
// use our custom catalog list for context. each context has a single catalog list.
// context will by default have [`MemoryCatalogProviderList`]
- ctx.register_catalog_list(cataloglist.clone());
+ ctx.register_catalog_list(catalog_list.clone());
// initialize our catalog and schemas
let catalog = DirCatalog::new();
@@ -81,7 +81,7 @@ async fn main() -> Result<()> {
ctx.register_catalog("dircat", Arc::new(catalog));
{
// catalog was passed down into our custom catalog list since we override the ctx's default
- let catalogs = cataloglist.catalogs.read().unwrap();
+ let catalogs = catalog_list.catalogs.read().unwrap();
assert!(catalogs.contains_key("dircat"));
};
@@ -144,8 +144,8 @@ impl DirSchema {
async fn create(state: &SessionState, opts: DirSchemaOpts<'_>) -> Result<Arc<Self>> {
let DirSchemaOpts { ext, dir, format } = opts;
let mut tables = HashMap::new();
- let direntries = std::fs::read_dir(dir).unwrap();
- for res in direntries {
+ let dir_entries = std::fs::read_dir(dir).unwrap();
+ for res in dir_entries {
let entry = res.unwrap();
let filename = entry.file_name().to_str().unwrap().to_string();
if !filename.ends_with(ext) {
diff --git a/datafusion-examples/examples/config_extension.rs b/datafusion-examples/examples/config_extension.rs
deleted file mode 100644
index b9f83f91ce564..0000000000000
--- a/datafusion-examples/examples/config_extension.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! This example demonstrates how to extend the DataFusion configs with custom extensions.
-
-use datafusion::{
- common::{config::ConfigExtension, extensions_options},
- config::ConfigOptions,
-};
-
-extensions_options! {
- /// My own config options.
- pub struct MyConfig {
- /// Should "foo" be replaced by "bar"?
- pub foo_to_bar: bool, default = true
-
- /// How many "baz" should be created?
- pub baz_count: usize, default = 1337
- }
-}
-
-impl ConfigExtension for MyConfig {
- const PREFIX: &'static str = "my_config";
-}
-
-fn main() {
- // set up config struct and register extension
- let mut config = ConfigOptions::default();
- config.extensions.insert(MyConfig::default());
-
- // overwrite config default
- config.set("my_config.baz_count", "42").unwrap();
-
- // check config state
- let my_config = config.extensions.get::<MyConfig>().unwrap();
- assert!(my_config.foo_to_bar,);
- assert_eq!(my_config.baz_count, 42,);
-}
diff --git a/datafusion-examples/examples/csv_opener.rs b/datafusion-examples/examples/csv_json_opener.rs
similarity index 50%
rename from datafusion-examples/examples/csv_opener.rs
rename to datafusion-examples/examples/csv_json_opener.rs
index e7b7ead109bc0..334e4c83404ff 100644
--- a/datafusion-examples/examples/csv_opener.rs
+++ b/datafusion-examples/examples/csv_json_opener.rs
@@ -15,28 +15,36 @@
// specific language governing permissions and limitations
// under the License.
-use std::{sync::Arc, vec};
+use std::sync::Arc;
+use arrow_schema::{DataType, Field, Schema};
use datafusion::{
assert_batches_eq,
datasource::{
file_format::file_compression_type::FileCompressionType,
listing::PartitionedFile,
object_store::ObjectStoreUrl,
- physical_plan::{CsvConfig, CsvOpener, FileScanConfig, FileStream},
+ physical_plan::{CsvConfig, CsvOpener, FileScanConfig, FileStream, JsonOpener},
},
error::Result,
physical_plan::metrics::ExecutionPlanMetricsSet,
test_util::aggr_test_schema,
};
-
use futures::StreamExt;
-use object_store::local::LocalFileSystem;
+use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore};
-/// This example demonstrates a scanning against an Arrow data source (CSV) and
-/// fetching results
+/// This example demonstrates using the low level [`FileStream`] / [`FileOpener`] APIs to directly
+/// read data from CSV and JSON files into Arrow RecordBatches.
+///
+/// If you want to query data in CSV or JSON files, see the [`dataframe.rs`] and [`sql_query.rs`] examples
#[tokio::main]
async fn main() -> Result<()> {
+ csv_opener().await?;
+ json_opener().await?;
+ Ok(())
+}
+
+async fn csv_opener() -> Result<()> {
let object_store = Arc::new(LocalFileSystem::new());
let schema = aggr_test_schema();
@@ -59,18 +67,17 @@ async fn main() -> Result<()> {
let path = std::path::Path::new(&path).canonicalize()?;
- let scan_config =
- FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema.clone())
- .with_projection(Some(vec![12, 0]))
- .with_limit(Some(5))
- .with_file(PartitionedFile::new(path.display().to_string(), 10));
-
- let result =
- FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())
- .unwrap()
- .map(|b| b.unwrap())
- .collect::<Vec<_>>()
- .await;
+ let scan_config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema)
+ .with_projection(Some(vec![12, 0]))
+ .with_limit(Some(5))
+ .with_file(PartitionedFile::new(path.display().to_string(), 10));
+
+ let mut result = vec![];
+ let mut stream =
+ FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())?;
+ while let Some(batch) = stream.next().await.transpose()? {
+ result.push(batch);
+ }
assert_batches_eq!(
&[
"+--------------------------------+----+",
@@ -87,3 +94,54 @@ async fn main() -> Result<()> {
);
Ok(())
}
+
+async fn json_opener() -> Result<()> {
+ let object_store = InMemory::new();
+ let path = object_store::path::Path::from("demo.json");
+ let data = bytes::Bytes::from(
+ r#"{"num":5,"str":"test"}
+ {"num":2,"str":"hello"}
+ {"num":4,"str":"foo"}"#,
+ );
+
+ object_store.put(&path, data.into()).await?;
+
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("num", DataType::Int64, false),
+ Field::new("str", DataType::Utf8, false),
+ ]));
+
+ let projected = Arc::new(schema.clone().project(&[1, 0])?);
+
+ let opener = JsonOpener::new(
+ 8192,
+ projected,
+ FileCompressionType::UNCOMPRESSED,
+ Arc::new(object_store),
+ );
+
+ let scan_config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema)
+ .with_projection(Some(vec![1, 0]))
+ .with_limit(Some(5))
+ .with_file(PartitionedFile::new(path.to_string(), 10));
+
+ let mut stream =
+ FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())?;
+ let mut result = vec![];
+ while let Some(batch) = stream.next().await.transpose()? {
+ result.push(batch);
+ }
+ assert_batches_eq!(
+ &[
+ "+-------+-----+",
+ "| str | num |",
+ "+-------+-----+",
+ "| test | 5 |",
+ "| hello | 2 |",
+ "| foo | 4 |",
+ "+-------+-----+",
+ ],
+ &result
+ );
+ Ok(())
+}
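For contrast with the low level `FileStream` path above, the same kinds of files can be queried through the high level `SessionContext` API, as the doc comment suggests. A minimal sketch with placeholder file paths (paths and option defaults are assumptions, not taken from the example):

use datafusion::error::Result;
use datafusion::prelude::*;

/// Hedged sketch: read CSV and newline-delimited JSON with the high level
/// API instead of building a FileStream by hand.
async fn high_level(ctx: &SessionContext) -> Result<()> {
    // CSV with an inferred schema
    ctx.read_csv("example.csv", CsvReadOptions::new())
        .await?
        .show()
        .await?;
    // newline-delimited JSON
    ctx.read_json("demo.json", NdJsonReadOptions::default())
        .await?
        .show()
        .await?;
    Ok(())
}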
diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_datasource.rs
index 0f7748b133650..bc865fac5a338 100644
--- a/datafusion-examples/examples/custom_datasource.rs
+++ b/datafusion-examples/examples/custom_datasource.rs
@@ -21,22 +21,23 @@ use std::fmt::{self, Debug, Formatter};
use std::sync::{Arc, Mutex};
use std::time::Duration;
+use async_trait::async_trait;
use datafusion::arrow::array::{UInt64Builder, UInt8Builder};
use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::{provider_as_source, TableProvider, TableType};
use datafusion::error::Result;
use datafusion::execution::context::TaskContext;
+use datafusion::logical_expr::LogicalPlanBuilder;
+use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::memory::MemoryStream;
use datafusion::physical_plan::{
- project_schema, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan,
- Partitioning, PlanProperties, SendableRecordBatchStream,
+ project_schema, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
+ PlanProperties, SendableRecordBatchStream,
};
use datafusion::prelude::*;
-use datafusion_expr::LogicalPlanBuilder;
-use datafusion_physical_expr::EquivalenceProperties;
-use async_trait::async_trait;
use datafusion::catalog::Session;
use tokio::time::timeout;
@@ -110,7 +111,7 @@ struct CustomDataSourceInner {
}
impl Debug for CustomDataSource {
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_str("custom_db")
}
}
@@ -214,13 +215,14 @@ impl CustomExec {
PlanProperties::new(
eq_properties,
Partitioning::UnknownPartitioning(1),
- ExecutionMode::Bounded,
+ EmissionType::Incremental,
+ Boundedness::Bounded,
)
}
}
impl DisplayAs for CustomExec {
- fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> std::fmt::Result {
+ fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
write!(f, "CustomExec")
}
}
diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs
index 1d9b587f15b93..95168597ebaaf 100644
--- a/datafusion-examples/examples/custom_file_format.rs
+++ b/datafusion-examples/examples/custom_file_format.rs
@@ -74,10 +74,7 @@ impl FileFormat for TSVFileFormat {
"tsv".to_string()
}
- fn get_ext_with_compression(
- &self,
- c: &FileCompressionType,
- ) -> datafusion::error::Result<String> {
+ fn get_ext_with_compression(&self, c: &FileCompressionType) -> Result<String> {
if c == &FileCompressionType::UNCOMPRESSED {
Ok("tsv".to_string())
} else {
@@ -154,7 +151,7 @@ impl FileFormatFactory for TSVFileFactory {
&self,
state: &SessionState,
format_options: &std::collections::HashMap<String, String>,
- ) -> Result<std::sync::Arc<dyn FileFormat>> {
+ ) -> Result<Arc<dyn FileFormat>> {
let mut new_options = format_options.clone();
new_options.insert("format.delimiter".to_string(), "\t".to_string());
@@ -164,7 +161,7 @@ impl FileFormatFactory for TSVFileFactory {
Ok(tsv_file_format)
}
- fn default(&self) -> std::sync::Arc<dyn FileFormat> {
+ fn default(&self) -> Arc<dyn FileFormat> {
todo!()
}
diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs
index d7e0068ef88f4..91d62135b9135 100644
--- a/datafusion-examples/examples/dataframe.rs
+++ b/datafusion-examples/examples/dataframe.rs
@@ -15,90 +15,116 @@
// specific language governing permissions and limitations
// under the License.
+use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
+use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::error::Result;
+use datafusion::functions_aggregate::average::avg;
+use datafusion::functions_aggregate::min_max::max;
use datafusion::prelude::*;
+use datafusion_common::config::CsvOptions;
+use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::DataFusionError;
+use datafusion_common::ScalarValue;
use std::fs::File;
use std::io::Write;
+use std::sync::Arc;
use tempfile::tempdir;
-/// This example demonstrates executing a simple query against an Arrow data source (Parquet) and
-/// fetching results, using the DataFrame trait
+/// This example demonstrates using DataFusion's DataFrame API
+///
+/// # Reading from different formats
+///
+/// * [read_parquet]: execute queries against parquet files
+/// * [read_csv]: execute queries against csv files
+/// * [read_memory]: execute queries against in-memory arrow data
+///
+/// # Writing out to local storage
+///
+/// The following examples demonstrate how to write a DataFrame to local
+/// storage. See `external_dependency/dataframe-to-s3.rs` for an example writing
+/// to a remote object store.
+///
+/// * [write_out]: write out a DataFrame to a table, parquet file, csv file, or json file
+///
+/// # Executing subqueries
+///
+/// * [where_scalar_subquery]: execute a scalar subquery
+/// * [where_in_subquery]: execute a subquery with an IN clause
+/// * [where_exist_subquery]: execute a subquery with an EXISTS clause
+///
+/// # Querying data
+///
+/// * [query_to_date]: use the `to_date` function to convert string columns to dates
#[tokio::main]
async fn main() -> Result<()> {
- // create local execution context
+ // The SessionContext is the main high level API for interacting with DataFusion
let ctx = SessionContext::new();
+ read_parquet(&ctx).await?;
+ read_csv(&ctx).await?;
+ read_memory(&ctx).await?;
+ write_out(&ctx).await?;
+ query_to_date().await?;
+ register_aggregate_test_data("t1", &ctx).await?;
+ register_aggregate_test_data("t2", &ctx).await?;
+ where_scalar_subquery(&ctx).await?;
+ where_in_subquery(&ctx).await?;
+ where_exist_subquery(&ctx).await?;
+ Ok(())
+}
+/// Use DataFrame API to
+/// 1. Read parquet files,
+/// 2. Show the schema
+/// 3. Select columns and rows
+async fn read_parquet(ctx: &SessionContext) -> Result<()> {
+ // Find the local path of "alltypes_plain.parquet"
let testdata = datafusion::test_util::parquet_test_data();
-
let filename = &format!("{testdata}/alltypes_plain.parquet");
- // define the query using the DataFrame trait
- let df = ctx
- .read_parquet(filename, ParquetReadOptions::default())
- .await?
- .select_columns(&["id", "bool_col", "timestamp_col"])?
- .filter(col("id").gt(lit(1)))?;
-
- // print the results
- df.show().await?;
-
- // create a csv file waiting to be written
- let dir = tempdir()?;
- let file_path = dir.path().join("example.csv");
- let file = File::create(&file_path)?;
- write_csv_file(file);
-
- // Reading CSV file with inferred schema example
- let csv_df =
- example_read_csv_file_with_inferred_schema(file_path.to_str().unwrap()).await;
- csv_df.show().await?;
-
- // Reading CSV file with defined schema
- let csv_df = example_read_csv_file_with_schema(file_path.to_str().unwrap()).await;
- csv_df.show().await?;
-
- // Reading PARQUET file and print describe
+ // Read the parquet files and show its schema using 'describe'
let parquet_df = ctx
.read_parquet(filename, ParquetReadOptions::default())
.await?;
- parquet_df.describe().await.unwrap().show().await?;
- let dyn_ctx = ctx.enable_url_table();
- let df = dyn_ctx
- .sql(&format!("SELECT * FROM '{}'", file_path.to_str().unwrap()))
+ // show its schema using 'describe'
+ parquet_df.clone().describe().await?.show().await?;
+
+ // Select three columns and filter the results
+ // so that only rows where id > 1 are returned
+ parquet_df
+ .select_columns(&["id", "bool_col", "timestamp_col"])?
+ .filter(col("id").gt(lit(1)))?
+ .show()
.await?;
- df.show().await?;
Ok(())
}
-// Function to create an test CSV file
-fn write_csv_file(mut file: File) {
- // Create the data to put into the csv file with headers
- let content = r#"id,time,vote,unixtime,rating
-a1,"10 6, 2013",3,1381017600,5.0
-a2,"08 9, 2013",2,1376006400,4.5"#;
- // write the data
- file.write_all(content.as_ref())
- .expect("Problem with writing file!");
-}
+/// Use the DataFrame API to
+/// 1. Read CSV files
+/// 2. Optionally specify schema
+async fn read_csv(ctx: &SessionContext) -> Result<()> {
+ // create example.csv file in a temporary directory
+ let dir = tempdir()?;
+ let file_path = dir.path().join("example.csv");
+ {
+ let mut file = File::create(&file_path)?;
+ // write CSV data
+ file.write_all(
+ r#"id,time,vote,unixtime,rating
+ a1,"10 6, 2013",3,1381017600,5.0
+ a2,"08 9, 2013",2,1376006400,4.5"#
+ .as_bytes(),
+ )?;
+ } // scope closes the file
+ let file_path = file_path.to_str().unwrap();
-// Example to read data from a csv file with inferred schema
-async fn example_read_csv_file_with_inferred_schema(file_path: &str) -> DataFrame {
- // Create a session context
- let ctx = SessionContext::new();
- // Register a lazy DataFrame using the context
- ctx.read_csv(file_path, CsvReadOptions::default())
- .await
- .unwrap()
-}
+ // You can read a CSV file and DataFusion will infer the schema automatically
+ let csv_df = ctx.read_csv(file_path, CsvReadOptions::default()).await?;
+ csv_df.show().await?;
-// Example to read csv file with a defined schema for the csv file
-async fn example_read_csv_file_with_schema(file_path: &str) -> DataFrame {
- // Create a session context
- let ctx = SessionContext::new();
- // Define the schema
+ // If you know the types of your data you can specify them explicitly
let schema = Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Field::new("time", DataType::Utf8, false),
@@ -112,6 +138,206 @@ async fn example_read_csv_file_with_schema(file_path: &str) -> DataFrame {
schema: Some(&schema),
..Default::default()
};
- // Register a lazy DataFrame by using the context and option provider
- ctx.read_csv(file_path, csv_read_option).await.unwrap()
+ let csv_df = ctx.read_csv(file_path, csv_read_option).await?;
+ csv_df.show().await?;
+
+ // You can also create DataFrames from the result of sql queries
+ // and using the `enable_url_table` refer to local files directly
+ let dyn_ctx = ctx.clone().enable_url_table();
+ let csv_df = dyn_ctx
+ .sql(&format!("SELECT rating, unixtime FROM '{}'", file_path))
+ .await?;
+ csv_df.show().await?;
+
+ Ok(())
+}
+
+/// Use the DataFrame API to:
+/// 1. Read in-memory data.
+async fn read_memory(ctx: &SessionContext) -> Result<()> {
+ // define data in memory
+ let a: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d"]));
+ let b: ArrayRef = Arc::new(Int32Array::from(vec![1, 10, 10, 100]));
+ let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?;
+
+ // declare a table in memory. In Apache Spark API, this corresponds to createDataFrame(...).
+ ctx.register_batch("t", batch)?;
+ let df = ctx.table("t").await?;
+
+ // construct an expression corresponding to "SELECT a, b FROM t WHERE b = 10" in SQL
+ let filter = col("b").eq(lit(10));
+ let df = df.select_columns(&["a", "b"])?.filter(filter)?;
+
+ // print the results
+ df.show().await?;
+
+ Ok(())
+}
+
+/// Use the DataFrame API to:
+/// 1. Write out a DataFrame to a table
+/// 2. Write out a DataFrame to a parquet file
+/// 3. Write out a DataFrame to a csv file
+/// 4. Write out a DataFrame to a json file
+async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionError> {
+ let mut df = ctx.sql("values ('a'), ('b'), ('c')").await.unwrap();
+
+ // Ensure the column names and types match the target table
+ df = df.with_column_renamed("column1", "tablecol1").unwrap();
+
+ ctx.sql(
+ "create external table
+ test(tablecol1 varchar)
+ stored as parquet
+ location './datafusion-examples/test_table/'",
+ )
+ .await?
+ .collect()
+ .await?;
+
+ // This is equivalent to INSERT INTO test VALUES ('a'), ('b'), ('c').
+ // The behavior of write_table depends on the TableProvider's implementation
+ // of the insert_into method.
+ df.clone()
+ .write_table("test", DataFrameWriteOptions::new())
+ .await?;
+
+ df.clone()
+ .write_parquet(
+ "./datafusion-examples/test_parquet/",
+ DataFrameWriteOptions::new(),
+ None,
+ )
+ .await?;
+
+ df.clone()
+ .write_csv(
+ "./datafusion-examples/test_csv/",
+ // DataFrameWriteOptions contains options which control how data is written
+ // such as compression codec
+ DataFrameWriteOptions::new(),
+ Some(CsvOptions::default().with_compression(CompressionTypeVariant::GZIP)),
+ )
+ .await?;
+
+ df.clone()
+ .write_json(
+ "./datafusion-examples/test_json/",
+ DataFrameWriteOptions::new(),
+ None,
+ )
+ .await?;
+
+ Ok(())
+}
+
+/// This example demonstrates how to use the to_date series
+/// of functions in the DataFrame API as well as via sql.
+async fn query_to_date() -> Result<()> {
+ // define a schema.
+ let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));
+
+ // define data.
+ let batch = RecordBatch::try_new(
+ schema,
+ vec![Arc::new(StringArray::from(vec![
+ "2020-09-08T13:42:29Z",
+ "2020-09-08T13:42:29.190855-05:00",
+ "2020-08-09 12:13:29",
+ "2020-01-02",
+ ]))],
+ )?;
+
+ // declare a new context. In the Spark API, this corresponds to a new Spark SQL session
+ let ctx = SessionContext::new();
+
+ // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
+ ctx.register_batch("t", batch)?;
+ let df = ctx.table("t").await?;
+
+ // use the to_date function to convert col 'a' to date type using the default parsing
+ let df = df.with_column("a", to_date(vec![col("a")]))?;
+
+ let df = df.select_columns(&["a"])?;
+
+ // print the results
+ df.show().await?;
+
+ Ok(())
+}
+
+/// Use the DataFrame API to execute the following subquery:
+/// select c1,c2 from t1 where (select avg(t2.c2) from t2 where t1.c1 = t2.c1)>0 limit 3;
+async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> {
+ ctx.table("t1")
+ .await?
+ .filter(
+ scalar_subquery(Arc::new(
+ ctx.table("t2")
+ .await?
+ .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))?
+ .aggregate(vec![], vec![avg(col("t2.c2"))])?
+ .select(vec![avg(col("t2.c2"))])?
+ .into_unoptimized_plan(),
+ ))
+ .gt(lit(0u8)),
+ )?
+ .select(vec![col("t1.c1"), col("t1.c2")])?
+ .limit(0, Some(3))?
+ .show()
+ .await?;
+ Ok(())
+}
+
+/// Use the DataFrame API to execute the following subquery:
+/// select t1.c1, t1.c2 from t1 where t1.c2 in (select max(t2.c2) from t2 where t2.c1 > 0 ) limit 3;
+async fn where_in_subquery(ctx: &SessionContext) -> Result<()> {
+ ctx.table("t1")
+ .await?
+ .filter(in_subquery(
+ col("t1.c2"),
+ Arc::new(
+ ctx.table("t2")
+ .await?
+ .filter(col("t2.c1").gt(lit(ScalarValue::UInt8(Some(0)))))?
+ .aggregate(vec![], vec![max(col("t2.c2"))])?
+ .select(vec![max(col("t2.c2"))])?
+ .into_unoptimized_plan(),
+ ),
+ ))?
+ .select(vec![col("t1.c1"), col("t1.c2")])?
+ .limit(0, Some(3))?
+ .show()
+ .await?;
+ Ok(())
+}
+
+/// Use the DataFrame API to execute the following subquery:
+/// select t1.c1, t1.c2 from t1 where exists (select t2.c2 from t2 where t1.c1 = t2.c1) limit 3;
+async fn where_exist_subquery(ctx: &SessionContext) -> Result<()> {
+ ctx.table("t1")
+ .await?
+ .filter(exists(Arc::new(
+ ctx.table("t2")
+ .await?
+ .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))?
+ .select(vec![col("t2.c2")])?
+ .into_unoptimized_plan(),
+ )))?
+ .select(vec![col("t1.c1"), col("t1.c2")])?
+ .limit(0, Some(3))?
+ .show()
+ .await?;
+ Ok(())
+}
+
+async fn register_aggregate_test_data(name: &str, ctx: &SessionContext) -> Result<()> {
+ let testdata = datafusion::test_util::arrow_test_data();
+ ctx.register_csv(
+ name,
+ &format!("{testdata}/csv/aggregate_test_100.csv"),
+ CsvReadOptions::default(),
+ )
+ .await?;
+ Ok(())
}
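Each subquery helper above mirrors the SQL given in its doc comment, so the same plan can be produced by passing that SQL straight to `SessionContext::sql`. A hedged sketch for the scalar-subquery case, assuming `t1` and `t2` are registered as in `main`:

// SQL form of where_scalar_subquery(), run directly
ctx.sql(
    "SELECT c1, c2 FROM t1 \
     WHERE (SELECT avg(t2.c2) FROM t2 WHERE t1.c1 = t2.c1) > 0 \
     LIMIT 3",
)
.await?
.show()
.await?;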
diff --git a/datafusion-examples/examples/dataframe_in_memory.rs b/datafusion-examples/examples/dataframe_in_memory.rs
deleted file mode 100644
index c57c38870a7e4..0000000000000
--- a/datafusion-examples/examples/dataframe_in_memory.rs
+++ /dev/null
@@ -1,60 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-
-use datafusion::arrow::array::{Int32Array, StringArray};
-use datafusion::arrow::datatypes::{DataType, Field, Schema};
-use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::error::Result;
-use datafusion::prelude::*;
-
-/// This example demonstrates how to use the DataFrame API against in-memory data.
-#[tokio::main]
-async fn main() -> Result<()> {
- // define a schema.
- let schema = Arc::new(Schema::new(vec![
- Field::new("a", DataType::Utf8, false),
- Field::new("b", DataType::Int32, false),
- ]));
-
- // define data.
- let batch = RecordBatch::try_new(
- schema,
- vec![
- Arc::new(StringArray::from(vec!["a", "b", "c", "d"])),
- Arc::new(Int32Array::from(vec![1, 10, 10, 100])),
- ],
- )?;
-
- // declare a new context. In spark API, this corresponds to a new spark SQLsession
- let ctx = SessionContext::new();
-
- // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
- ctx.register_batch("t", batch)?;
- let df = ctx.table("t").await?;
-
- // construct an expression corresponding to "SELECT a, b FROM t WHERE b = 10" in SQL
- let filter = col("b").eq(lit(10));
-
- let df = df.select_columns(&["a", "b"])?.filter(filter)?;
-
- // print the results
- df.show().await?;
-
- Ok(())
-}
diff --git a/datafusion-examples/examples/dataframe_output.rs b/datafusion-examples/examples/dataframe_output.rs
deleted file mode 100644
index 60ca090d722d6..0000000000000
--- a/datafusion-examples/examples/dataframe_output.rs
+++ /dev/null
@@ -1,78 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::{dataframe::DataFrameWriteOptions, prelude::*};
-use datafusion_common::config::CsvOptions;
-use datafusion_common::{parsers::CompressionTypeVariant, DataFusionError};
-
-/// This example demonstrates the various methods to write out a DataFrame to local storage.
-/// See datafusion-examples/examples/external_dependency/dataframe-to-s3.rs for an example
-/// using a remote object store.
-#[tokio::main]
-async fn main() -> Result<(), DataFusionError> {
- let ctx = SessionContext::new();
-
- let mut df = ctx.sql("values ('a'), ('b'), ('c')").await.unwrap();
-
- // Ensure the column names and types match the target table
- df = df.with_column_renamed("column1", "tablecol1").unwrap();
-
- ctx.sql(
- "create external table
- test(tablecol1 varchar)
- stored as parquet
- location './datafusion-examples/test_table/'",
- )
- .await?
- .collect()
- .await?;
-
- // This is equivalent to INSERT INTO test VALUES ('a'), ('b'), ('c').
- // The behavior of write_table depends on the TableProvider's implementation
- // of the insert_into method.
- df.clone()
- .write_table("test", DataFrameWriteOptions::new())
- .await?;
-
- df.clone()
- .write_parquet(
- "./datafusion-examples/test_parquet/",
- DataFrameWriteOptions::new(),
- None,
- )
- .await?;
-
- df.clone()
- .write_csv(
- "./datafusion-examples/test_csv/",
- // DataFrameWriteOptions contains options which control how data is written
- // such as compression codec
- DataFrameWriteOptions::new(),
- Some(CsvOptions::default().with_compression(CompressionTypeVariant::GZIP)),
- )
- .await?;
-
- df.clone()
- .write_json(
- "./datafusion-examples/test_json/",
- DataFrameWriteOptions::new(),
- None,
- )
- .await?;
-
- Ok(())
-}
diff --git a/datafusion-examples/examples/dataframe_subquery.rs b/datafusion-examples/examples/dataframe_subquery.rs
deleted file mode 100644
index 3e3d0c1b5a84b..0000000000000
--- a/datafusion-examples/examples/dataframe_subquery.rs
+++ /dev/null
@@ -1,118 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow_schema::DataType;
-use std::sync::Arc;
-
-use datafusion::error::Result;
-use datafusion::functions_aggregate::average::avg;
-use datafusion::functions_aggregate::min_max::max;
-use datafusion::prelude::*;
-use datafusion::test_util::arrow_test_data;
-use datafusion_common::ScalarValue;
-
-/// This example demonstrates how to use the DataFrame API to create a subquery.
-#[tokio::main]
-async fn main() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_test_data("t1", &ctx).await?;
- register_aggregate_test_data("t2", &ctx).await?;
-
- where_scalar_subquery(&ctx).await?;
-
- where_in_subquery(&ctx).await?;
-
- where_exist_subquery(&ctx).await?;
-
- Ok(())
-}
-
-//select c1,c2 from t1 where (select avg(t2.c2) from t2 where t1.c1 = t2.c1)>0 limit 3;
-async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> {
- ctx.table("t1")
- .await?
- .filter(
- scalar_subquery(Arc::new(
- ctx.table("t2")
- .await?
- .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))?
- .aggregate(vec![], vec![avg(col("t2.c2"))])?
- .select(vec![avg(col("t2.c2"))])?
- .into_unoptimized_plan(),
- ))
- .gt(lit(0u8)),
- )?
- .select(vec![col("t1.c1"), col("t1.c2")])?
- .limit(0, Some(3))?
- .show()
- .await?;
- Ok(())
-}
-
-//SELECT t1.c1, t1.c2 FROM t1 WHERE t1.c2 in (select max(t2.c2) from t2 where t2.c1 > 0 ) limit 3;
-async fn where_in_subquery(ctx: &SessionContext) -> Result<()> {
- ctx.table("t1")
- .await?
- .filter(in_subquery(
- col("t1.c2"),
- Arc::new(
- ctx.table("t2")
- .await?
- .filter(col("t2.c1").gt(lit(ScalarValue::UInt8(Some(0)))))?
- .aggregate(vec![], vec![max(col("t2.c2"))])?
- .select(vec![max(col("t2.c2"))])?
- .into_unoptimized_plan(),
- ),
- ))?
- .select(vec![col("t1.c1"), col("t1.c2")])?
- .limit(0, Some(3))?
- .show()
- .await?;
- Ok(())
-}
-
-//SELECT t1.c1, t1.c2 FROM t1 WHERE EXISTS (select t2.c2 from t2 where t1.c1 = t2.c1) limit 3;
-async fn where_exist_subquery(ctx: &SessionContext) -> Result<()> {
- ctx.table("t1")
- .await?
- .filter(exists(Arc::new(
- ctx.table("t2")
- .await?
- .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))?
- .select(vec![col("t2.c2")])?
- .into_unoptimized_plan(),
- )))?
- .select(vec![col("t1.c1"), col("t1.c2")])?
- .limit(0, Some(3))?
- .show()
- .await?;
- Ok(())
-}
-
-pub async fn register_aggregate_test_data(
- name: &str,
- ctx: &SessionContext,
-) -> Result<()> {
- let testdata = arrow_test_data();
- ctx.register_csv(
- name,
- &format!("{testdata}/csv/aggregate_test_100.csv"),
- CsvReadOptions::default(),
- )
- .await?;
- Ok(())
-}
diff --git a/datafusion-examples/examples/deserialize_to_struct.rs b/datafusion-examples/examples/deserialize_to_struct.rs
index 985cab703a5cb..5ac3ee6187d11 100644
--- a/datafusion-examples/examples/deserialize_to_struct.rs
+++ b/datafusion-examples/examples/deserialize_to_struct.rs
@@ -15,62 +15,136 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::array::AsArray;
+use arrow::array::{AsArray, PrimitiveArray};
use arrow::datatypes::{Float64Type, Int32Type};
use datafusion::error::Result;
use datafusion::prelude::*;
+use datafusion_common::assert_batches_eq;
use futures::StreamExt;
-/// This example shows that it is possible to convert query results into Rust structs .
+/// This example shows how to convert query results into Rust structs by using
+/// the Arrow APIs to convert the results into Rust native types.
+///
+/// This is a bit tricky initially as the results are returned as columns stored
+/// as [ArrayRef]
+///
+/// [ArrayRef]: arrow::array::ArrayRef
#[tokio::main]
async fn main() -> Result<()> {
- let data_list = Data::new().await?;
- println!("{data_list:#?}");
- Ok(())
-}
+ // Run a query that returns two columns of data
+ let ctx = SessionContext::new();
+ let testdata = datafusion::test_util::parquet_test_data();
+ ctx.register_parquet(
+ "alltypes_plain",
+ &format!("{testdata}/alltypes_plain.parquet"),
+ ParquetReadOptions::default(),
+ )
+ .await?;
+ let df = ctx
+ .sql("SELECT int_col, double_col FROM alltypes_plain")
+ .await?;
-#[derive(Debug)]
-struct Data {
- #[allow(dead_code)]
- int_col: i32,
- #[allow(dead_code)]
- double_col: f64,
-}
+ // print out the results showing we have an int32 and a float64 column
+ let results = df.clone().collect().await?;
+ assert_batches_eq!(
+ [
+ "+---------+------------+",
+ "| int_col | double_col |",
+ "+---------+------------+",
+ "| 0 | 0.0 |",
+ "| 1 | 10.1 |",
+ "| 0 | 0.0 |",
+ "| 1 | 10.1 |",
+ "| 0 | 0.0 |",
+ "| 1 | 10.1 |",
+ "| 0 | 0.0 |",
+ "| 1 | 10.1 |",
+ "+---------+------------+",
+ ],
+ &results
+ );
-impl Data {
- pub async fn new() -> Result<Vec<Data>> {
- // this group is almost the same as the one you find it in parquet_sql.rs
- let ctx = SessionContext::new();
+ // We will now convert the query results into a Rust struct
+ let mut stream = df.execute_stream().await?;
+ let mut list = vec![];
- let testdata = datafusion::test_util::parquet_test_data();
+ // DataFusion produces data in chunks called `RecordBatch`es which are
+ // typically 8000 rows each. This loop processes each `RecordBatch` as it is
+ // produced by the query plan and adds it to the list
+ while let Some(b) = stream.next().await.transpose()? {
+ // Each `RecordBatch` has one or more columns. Each column is stored as
+ // an `ArrayRef`. To interact with data using Rust native types we need to
+ // convert these `ArrayRef`s into concrete array types using APIs from
+ // the arrow crate.
- ctx.register_parquet(
- "alltypes_plain",
- &format!("{testdata}/alltypes_plain.parquet"),
- ParquetReadOptions::default(),
- )
- .await?;
+ // In this case, we know that each batch has two columns of the Arrow
+ // types Int32 and Float64, so first we cast the two columns to the
+ // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast):
+ let int_col: &PrimitiveArray<Int32Type> = b.column(0).as_primitive();
+ let float_col: &PrimitiveArray<Float64Type> = b.column(1).as_primitive();
- let df = ctx
- .sql("SELECT int_col, double_col FROM alltypes_plain")
- .await?;
+ // With PrimitiveArrays, we can access the values as native Rust
+ // types i32 and f64, and form the desired `Data` structs
+ for (i, f) in int_col.values().iter().zip(float_col.values()) {
+ list.push(Data {
+ int_col: *i,
+ double_col: *f,
+ })
+ }
+ }
- df.clone().show().await?;
+ // Finally, we have the results in the list of Rust structs
+ let res = format!("{list:#?}");
+ assert_eq!(
+ res,
+ r#"[
+ Data {
+ int_col: 0,
+ double_col: 0.0,
+ },
+ Data {
+ int_col: 1,
+ double_col: 10.1,
+ },
+ Data {
+ int_col: 0,
+ double_col: 0.0,
+ },
+ Data {
+ int_col: 1,
+ double_col: 10.1,
+ },
+ Data {
+ int_col: 0,
+ double_col: 0.0,
+ },
+ Data {
+ int_col: 1,
+ double_col: 10.1,
+ },
+ Data {
+ int_col: 0,
+ double_col: 0.0,
+ },
+ Data {
+ int_col: 1,
+ double_col: 10.1,
+ },
+]"#
+ );
- let mut stream = df.execute_stream().await?;
- let mut list = vec![];
- while let Some(b) = stream.next().await.transpose()? {
- let int_col = b.column(0).as_primitive::<Int32Type>();
- let float_col = b.column(1).as_primitive::<Float64Type>();
+ // Use the fields in the struct to avoid clippy complaints
+ let int_sum = list.iter().fold(0, |acc, x| acc + x.int_col);
+ let double_sum = list.iter().fold(0.0, |acc, x| acc + x.double_col);
+ assert_eq!(int_sum, 4);
+ assert_eq!(double_sum, 40.4);
- for (i, f) in int_col.values().iter().zip(float_col.values()) {
- list.push(Data {
- int_col: *i,
- double_col: *f,
- })
- }
- }
+ Ok(())
+}
- Ok(list)
- }
+/// This is the target struct into which we want the query results.
+#[derive(Debug)]
+struct Data {
+ int_col: i32,
+ double_col: f64,
}
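One caveat on the conversion loop above: `values()` reads the raw value buffer and ignores the null bitmap, which is fine here because both columns are non-nullable. A hedged sketch of a null-aware variant, reusing `int_col`, `float_col`, and `list` from the loop above:

// iter() yields Option<i32> / Option<f64> per row, so NULLs can be
// skipped (or mapped to a default) instead of being read as raw values.
for (i, f) in int_col.iter().zip(float_col.iter()) {
    if let (Some(i), Some(f)) = (i, f) {
        list.push(Data { int_col: i, double_col: f });
    }
}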
diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 85a79a1a56048..2f9b5697c243d 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -18,7 +18,7 @@
use std::collections::HashMap;
use std::sync::Arc;
-use arrow::array::{BooleanArray, Int32Array};
+use arrow::array::{BooleanArray, Int32Array, Int8Array};
use arrow::record_batch::RecordBatch;
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
@@ -28,12 +28,14 @@ use datafusion::functions_aggregate::first_last::first_value_udaf;
use datafusion::optimizer::simplify_expressions::ExprSimplifier;
use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries};
use datafusion::prelude::*;
+use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{ScalarValue, ToDFSchema};
use datafusion_expr::execution_props::ExecutionProps;
use datafusion_expr::expr::BinaryExpr;
use datafusion_expr::interval_arithmetic::Interval;
use datafusion_expr::simplify::SimplifyContext;
use datafusion_expr::{ColumnarValue, ExprFunctionExt, ExprSchemable, Operator};
+use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter;
/// This example demonstrates the DataFusion [`Expr`] API.
///
@@ -51,6 +53,7 @@ use datafusion_expr::{ColumnarValue, ExprFunctionExt, ExprSchemable, Operator};
/// 4. Simplify expressions: [`simplify_demo`]
/// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`]
/// 6. Get the types of the expressions: [`expression_type_demo`]
+/// 7. Apply type coercion to expressions: [`type_coercion_demo`]
#[tokio::main]
async fn main() -> Result<()> {
// The easiest way to do create expressions is to use the
@@ -80,6 +83,9 @@ async fn main() -> Result<()> {
// See how to determine the data types of expressions
expression_type_demo()?;
+ // See how to type coerce expressions.
+ type_coercion_demo()?;
+
Ok(())
}
@@ -316,3 +322,103 @@ fn expression_type_demo() -> Result<()> {
Ok(())
}
+
+/// This function demonstrates how to apply type coercion to expressions, such as binary expressions.
+///
+/// In most cases, manual type coercion is not required since DataFusion handles it implicitly.
+/// However, certain projects may construct `ExecutionPlan`s directly from DataFusion logical expressions,
+/// bypassing the construction of DataFusion logical plans.
+/// Since constructing `ExecutionPlan`s from logical expressions does not automatically apply type coercion,
+/// you may need to handle type coercion manually in these cases.
+///
+/// The code in this function shows various ways to perform type coercion on expressions:
+/// 1. Using `SessionContext::create_physical_expr`
+/// 2. Using `ExprSimplifier::coerce`
+/// 3. Using `TreeNodeRewriter::rewrite` based on `TypeCoercionRewriter`
+/// 4. Using `TreeNode::transform`
+///
+/// Note that this list may not be complete; there may be other methods to apply type coercion to expressions.
+fn type_coercion_demo() -> Result<()> {
+ // Creates a record batch for demo.
+ let df_schema = DFSchema::from_unqualified_fields(
+ vec![Field::new("a", DataType::Int8, false)].into(),
+ HashMap::new(),
+ )?;
+ let i8_array = Int8Array::from_iter_values(vec![0, 1, 2]);
+ let batch = RecordBatch::try_new(
+ Arc::new(df_schema.as_arrow().to_owned()),
+ vec![Arc::new(i8_array) as _],
+ )?;
+
+ // Constructs a binary expression for demo.
+ // By default, the literal `1` is translated into the Int32 type and cannot be directly compared with the Int8 type.
+ let expr = col("a").gt(lit(1));
+
+ // Evaluation with an expression that has not been type coerced cannot succeed.
+ let props = ExecutionProps::default();
+ let physical_expr =
+ datafusion_physical_expr::create_physical_expr(&expr, &df_schema, &props)?;
+ let e = physical_expr.evaluate(&batch).unwrap_err();
+ assert!(e
+ .find_root()
+ .to_string()
+ .contains("Invalid comparison operation: Int8 > Int32"));
+
+ // 1. Type coercion with `SessionContext::create_physical_expr` which implicitly applies type coercion before constructing the physical expr.
+ let physical_expr =
+ SessionContext::new().create_physical_expr(expr.clone(), &df_schema)?;
+ assert!(physical_expr.evaluate(&batch).is_ok());
+
+ // 2. Type coercion with `ExprSimplifier::coerce`.
+ let context = SimplifyContext::new(&props).with_schema(Arc::new(df_schema.clone()));
+ let simplifier = ExprSimplifier::new(context);
+ let coerced_expr = simplifier.coerce(expr.clone(), &df_schema)?;
+ let physical_expr = datafusion_physical_expr::create_physical_expr(
+ &coerced_expr,
+ &df_schema,
+ &props,
+ )?;
+ assert!(physical_expr.evaluate(&batch).is_ok());
+
+ // 3. Type coercion with `TypeCoercionRewriter`.
+ let coerced_expr = expr
+ .clone()
+ .rewrite(&mut TypeCoercionRewriter::new(&df_schema))?
+ .data;
+ let physical_expr = datafusion_physical_expr::create_physical_expr(
+ &coerced_expr,
+ &df_schema,
+ &props,
+ )?;
+ assert!(physical_expr.evaluate(&batch).is_ok());
+
+ // 4. Apply explicit type coercion by manually rewriting the expression
+ let coerced_expr = expr
+ .transform(|e| {
+ // Only type coerces binary expressions.
+ let Expr::BinaryExpr(e) = e else {
+ return Ok(Transformed::no(e));
+ };
+ if let Expr::Column(ref col_expr) = *e.left {
+ let field = df_schema.field_with_name(None, col_expr.name())?;
+ let cast_to_type = field.data_type();
+ let coerced_right = e.right.cast_to(cast_to_type, &df_schema)?;
+ Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr::new(
+ e.left,
+ e.op,
+ Box::new(coerced_right),
+ ))))
+ } else {
+ Ok(Transformed::no(Expr::BinaryExpr(e)))
+ }
+ })?
+ .data;
+ let physical_expr = datafusion_physical_expr::create_physical_expr(
+ &coerced_expr,
+ &df_schema,
+ &props,
+ )?;
+ assert!(physical_expr.evaluate(&batch).is_ok());
+
+ Ok(())
+}
diff --git a/datafusion-examples/examples/ffi/README.md b/datafusion-examples/examples/ffi/README.md
new file mode 100644
index 0000000000000..f29e0012f3180
--- /dev/null
+++ b/datafusion-examples/examples/ffi/README.md
@@ -0,0 +1,48 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Example FFI Usage
+
+The purpose of these crates is to provide an example of how one can use the
+DataFusion Foreign Function Interface (FFI). See [API Docs] for detailed
+usage.
+
+This example is broken into three crates.
+
+- `ffi_module_interface` is a common library to be shared by both the module
+ to be loaded and the program that will load it. It defines how the module
+ is to be structured.
+- `ffi_example_table_provider` creates a library that exposes the module.
+- `ffi_module_loader` is an example program that loads the module, gets data
+ from it, and displays this data to the user.
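+
+For reference, the shared interface is a small `#[repr(C)]` struct holding a
+constructor function pointer (a trimmed view of `ffi_module_interface`; see
+that crate for the full definition):
+
+```rust
+use abi_stable::StableAbi;
+use datafusion_ffi::table_provider::FFI_TableProvider;
+
+#[repr(C)]
+#[derive(StableAbi)]
+#[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))]
+pub struct TableProviderModule {
+    /// Constructs the table provider inside the loaded module.
+    pub create_table: extern "C" fn() -> FFI_TableProvider,
+}
+```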
+
+## Building and running
+
+In order for the program to run successfully, the module to be loaded must be
+built first. This example expects both the module and the program to be
+built using the same build mode (debug or release).
+
+```shell
+cd ffi_example_table_provider
+cargo build
+cd ../ffi_module_loader
+cargo run
+```
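+
+Once built, loading and querying the module from Rust looks roughly like the
+following (a condensed sketch of what `ffi_module_loader` does; see that crate
+for the complete program):
+
+```rust
+use std::sync::Arc;
+
+use abi_stable::library::{development_utils::compute_library_path, RootModule};
+use datafusion::{
+    error::{DataFusionError, Result},
+    prelude::SessionContext,
+};
+use datafusion_ffi::table_provider::ForeignTableProvider;
+use ffi_module_interface::TableProviderModuleRef;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Locate and load the shared library that implements the module.
+    let target: &std::path::Path = "../../../../target/".as_ref();
+    let library_path = compute_library_path::<TableProviderModuleRef>(target)
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+    let module = TableProviderModuleRef::load_from_directory(&library_path)
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+    // Ask the module to construct its table provider across the FFI boundary
+    // and wrap it so DataFusion can use it as a normal `TableProvider`.
+    let ffi_table = module.create_table().ok_or(DataFusionError::NotImplemented(
+        "External table provider failed to implement create_table".to_string(),
+    ))?();
+    let provider: ForeignTableProvider = (&ffi_table).into();
+
+    // Register and query it like any other table.
+    let ctx = SessionContext::new();
+    ctx.register_table("external_table", Arc::new(provider))?;
+    ctx.table("external_table").await?.show().await?;
+    Ok(())
+}
+```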
+
+[api docs]: http://docs.rs/datafusion-ffi/latest
diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml
new file mode 100644
index 0000000000000..52efdb7461abe
--- /dev/null
+++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "ffi_example_table_provider"
+version = "0.1.0"
+edition = { workspace = true }
+publish = false
+
+[dependencies]
+abi_stable = "0.11.3"
+arrow = { workspace = true }
+arrow-array = { workspace = true }
+arrow-schema = { workspace = true }
+datafusion = { workspace = true }
+datafusion-ffi = { workspace = true }
+ffi_module_interface = { path = "../ffi_module_interface" }
+
+[lib]
+name = "ffi_example_table_provider"
+crate-type = ["cdylib", 'rlib']
diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs
new file mode 100644
index 0000000000000..c7eea8a8070b1
--- /dev/null
+++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use abi_stable::{export_root_module, prefix_type::PrefixTypeTrait};
+use arrow_array::RecordBatch;
+use datafusion::{
+ arrow::datatypes::{DataType, Field, Schema},
+ common::record_batch,
+ datasource::MemTable,
+};
+use datafusion_ffi::table_provider::FFI_TableProvider;
+use ffi_module_interface::{TableProviderModule, TableProviderModuleRef};
+
+fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch {
+ let end_value = start_value + num_values as i32;
+ let a_vals: Vec<i32> = (start_value..end_value).collect();
+ let b_vals: Vec<f64> = a_vals.iter().map(|v| *v as f64).collect();
+
+ record_batch!(("a", Int32, a_vals), ("b", Float64, b_vals)).unwrap()
+}
+
+/// Here we only wish to create a simple table provider as an example.
+/// We create an in-memory table and convert it to its FFI counterpart.
+extern "C" fn construct_simple_table_provider() -> FFI_TableProvider {
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("a", DataType::Int32, true),
+ Field::new("b", DataType::Float64, true),
+ ]));
+
+ // It is useful to create these as multiple record batches
+ // so that we can demonstrate the FFI stream.
+ let batches = vec![
+ create_record_batch(1, 5),
+ create_record_batch(6, 1),
+ create_record_batch(7, 5),
+ ];
+
+ let table_provider = MemTable::try_new(schema, vec![batches]).unwrap();
+
+ FFI_TableProvider::new(Arc::new(table_provider), true)
+}
+
+#[export_root_module]
+/// This defines the entry point for using the module.
+pub fn get_simple_memory_table() -> TableProviderModuleRef {
+ TableProviderModule {
+ create_table: construct_simple_table_provider,
+ }
+ .leak_into_prefix()
+}
diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml
new file mode 100644
index 0000000000000..612a219324763
--- /dev/null
+++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "ffi_module_interface"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies]
+abi_stable = "0.11.3"
+datafusion-ffi = { workspace = true }
diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs
new file mode 100644
index 0000000000000..88690e9297135
--- /dev/null
+++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use abi_stable::{
+ declare_root_module_statics,
+ library::{LibraryError, RootModule},
+ package_version_strings,
+ sabi_types::VersionStrings,
+ StableAbi,
+};
+use datafusion_ffi::table_provider::FFI_TableProvider;
+
+#[repr(C)]
+#[derive(StableAbi)]
+#[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))]
+/// This struct defines the module interface. It is to be shared by
+/// both the module loading program and the library that implements the
+/// module. It is possible to move this definition into the loading
+/// program and reference it in the modules, but this example shows
+/// how a user may wish to separate these concerns.
+pub struct TableProviderModule {
+ /// Constructs the table provider
+ pub create_table: extern "C" fn() -> FFI_TableProvider,
+}
+
+impl RootModule for TableProviderModuleRef {
+ declare_root_module_statics! {TableProviderModuleRef}
+ const BASE_NAME: &'static str = "ffi_example_table_provider";
+ const NAME: &'static str = "ffi_example_table_provider";
+ const VERSION_STRINGS: VersionStrings = package_version_strings!();
+
+ fn initialization(self) -> Result<Self, LibraryError> {
+ Ok(self)
+ }
+}
diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml
new file mode 100644
index 0000000000000..028a366aab1c0
--- /dev/null
+++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "ffi_module_loader"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies]
+abi_stable = "0.11.3"
+datafusion = { workspace = true }
+datafusion-ffi = { workspace = true }
+ffi_module_interface = { path = "../ffi_module_interface" }
+tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs
new file mode 100644
index 0000000000000..6e376ca866e8f
--- /dev/null
+++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion::{
+ error::{DataFusionError, Result},
+ prelude::SessionContext,
+};
+
+use abi_stable::library::{development_utils::compute_library_path, RootModule};
+use datafusion_ffi::table_provider::ForeignTableProvider;
+use ffi_module_interface::TableProviderModuleRef;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ // Find the location of the library. This is specific to the build environment,
+ // so you will need to change the approach here based on your use case.
+ let target: &std::path::Path = "../../../../target/".as_ref();
+ let library_path = compute_library_path::<TableProviderModuleRef>(target)
+ .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+ // Load the module
+ let table_provider_module =
+ TableProviderModuleRef::load_from_directory(&library_path)
+ .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+ // By calling the code below, the table provider will be created within
+ // the module's code.
+ let ffi_table_provider =
+ table_provider_module
+ .create_table()
+ .ok_or(DataFusionError::NotImplemented(
+ "External table provider failed to implement create_table".to_string(),
+ ))?();
+
+ // In order to access the table provider within this executable, we need to
+ // turn it into a `ForeignTableProvider`.
+ let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into();
+
+ let ctx = SessionContext::new();
+
+ // Display the data to show the full cycle works.
+ ctx.register_table("external_table", Arc::new(foreign_table_provider))?;
+ let df = ctx.table("external_table").await?;
+ df.show().await?;
+
+ Ok(())
+}
diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/flight_server.rs
index f9d1b8029f04b..cc5f43746ddfb 100644
--- a/datafusion-examples/examples/flight/flight_server.rs
+++ b/datafusion-examples/examples/flight/flight_server.rs
@@ -105,7 +105,7 @@ impl FlightService for FlightServiceImpl {
}
// add an initial FlightData message that sends schema
- let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default();
+ let options = arrow::ipc::writer::IpcWriteOptions::default();
let schema_flight_data = SchemaAsIpc::new(&schema, &options);
let mut flights = vec![FlightData::from(schema_flight_data)];
diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/function_factory.rs
index f57b3bf604048..58ffa060ebaad 100644
--- a/datafusion-examples/examples/function_factory.rs
+++ b/datafusion-examples/examples/function_factory.rs
@@ -26,7 +26,9 @@ use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{exec_err, internal_err, DataFusionError};
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
-use datafusion_expr::{CreateFunction, Expr, ScalarUDF, ScalarUDFImpl, Signature};
+use datafusion_expr::{
+ CreateFunction, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
+};
/// This example shows how to utilize [FunctionFactory] to implement simple
/// SQL-macro like functions using a `CREATE FUNCTION` statement. The same
@@ -34,7 +36,7 @@ use datafusion_expr::{CreateFunction, Expr, ScalarUDF, ScalarUDFImpl, Signature}
///
/// Apart from [FunctionFactory], this example covers
/// [ScalarUDFImpl::simplify()] which is often used at the same time, to replace
-/// a function call with another expression at rutime.
+/// a function call with another expression at runtime.
///
/// This example is rather simple and does not cover all cases required for a
/// real implementation.
@@ -121,7 +123,7 @@ impl ScalarUDFImpl for ScalarFunctionWrapper {
&self.name
}
- fn signature(&self) -> &datafusion_expr::Signature {
+ fn signature(&self) -> &Signature {
&self.signature
}
@@ -132,9 +134,9 @@ impl ScalarUDFImpl for ScalarFunctionWrapper {
Ok(self.return_type.clone())
}
- fn invoke(
+ fn invoke_with_args(
&self,
- _args: &[datafusion_expr::ColumnarValue],
+ _args: ScalarFunctionArgs,
) -> Result<ColumnarValue> {
// Since this function is always simplified to another expression, it
// should never actually be invoked
diff --git a/datafusion-examples/examples/json_opener.rs b/datafusion-examples/examples/json_opener.rs
deleted file mode 100644
index 7bc431c5c5eef..0000000000000
--- a/datafusion-examples/examples/json_opener.rs
+++ /dev/null
@@ -1,88 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::{sync::Arc, vec};
-
-use arrow_schema::{DataType, Field, Schema};
-use datafusion::{
- assert_batches_eq,
- datasource::{
- file_format::file_compression_type::FileCompressionType,
- listing::PartitionedFile,
- object_store::ObjectStoreUrl,
- physical_plan::{FileScanConfig, FileStream, JsonOpener},
- },
- error::Result,
- physical_plan::metrics::ExecutionPlanMetricsSet,
-};
-
-use futures::StreamExt;
-use object_store::ObjectStore;
-
-/// This example demonstrates a scanning against an Arrow data source (JSON) and
-/// fetching results
-#[tokio::main]
-async fn main() -> Result<()> {
- let object_store = object_store::memory::InMemory::new();
- let path = object_store::path::Path::from("demo.json");
- let data = bytes::Bytes::from(
- r#"{"num":5,"str":"test"}
- {"num":2,"str":"hello"}
- {"num":4,"str":"foo"}"#,
- );
- object_store.put(&path, data.into()).await.unwrap();
-
- let schema = Arc::new(Schema::new(vec![
- Field::new("num", DataType::Int64, false),
- Field::new("str", DataType::Utf8, false),
- ]));
-
- let projected = Arc::new(schema.clone().project(&[1, 0])?);
-
- let opener = JsonOpener::new(
- 8192,
- projected,
- FileCompressionType::UNCOMPRESSED,
- Arc::new(object_store),
- );
-
- let scan_config =
- FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema.clone())
- .with_projection(Some(vec![1, 0]))
- .with_limit(Some(5))
- .with_file(PartitionedFile::new(path.to_string(), 10));
-
- let result =
- FileStream::new(&scan_config, 0, opener, &ExecutionPlanMetricsSet::new())
- .unwrap()
- .map(|b| b.unwrap())
- .collect::<Vec<_>>()
- .await;
- assert_batches_eq!(
- &[
- "+-------+-----+",
- "| str | num |",
- "+-------+-----+",
- "| test | 5 |",
- "| hello | 2 |",
- "| foo | 4 |",
- "+-------+-----+",
- ],
- &result
- );
- Ok(())
-}
diff --git a/datafusion-examples/examples/memtable.rs b/datafusion-examples/examples/memtable.rs
deleted file mode 100644
index 5cce578039e74..0000000000000
--- a/datafusion-examples/examples/memtable.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::arrow::array::{UInt64Array, UInt8Array};
-use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::datasource::MemTable;
-use datafusion::error::Result;
-use datafusion::prelude::SessionContext;
-use std::sync::Arc;
-use std::time::Duration;
-use tokio::time::timeout;
-
-/// This example demonstrates executing a simple query against a Memtable
-#[tokio::main]
-async fn main() -> Result<()> {
- let mem_table = create_memtable()?;
-
- // create local execution context
- let ctx = SessionContext::new();
-
- // Register the in-memory table containing the data
- ctx.register_table("users", Arc::new(mem_table))?;
-
- let dataframe = ctx.sql("SELECT * FROM users;").await?;
-
- timeout(Duration::from_secs(10), async move {
- let result = dataframe.collect().await.unwrap();
- let record_batch = result.first().unwrap();
-
- assert_eq!(1, record_batch.column(0).len());
- dbg!(record_batch.columns());
- })
- .await
- .unwrap();
-
- Ok(())
-}
-
-fn create_memtable() -> Result<MemTable> {
- MemTable::try_new(get_schema(), vec![vec![create_record_batch()?]])
-}
-
-fn create_record_batch() -> Result<RecordBatch> {
- let id_array = UInt8Array::from(vec![1]);
- let account_array = UInt64Array::from(vec![9000]);
-
- Ok(RecordBatch::try_new(
- get_schema(),
- vec![Arc::new(id_array), Arc::new(account_array)],
- )
- .unwrap())
-}
-
-fn get_schema() -> SchemaRef {
- SchemaRef::new(Schema::new(vec![
- Field::new("id", DataType::UInt8, false),
- Field::new("bank_account", DataType::UInt64, true),
- ]))
-}
diff --git a/datafusion-examples/examples/optimizer_rule.rs b/datafusion-examples/examples/optimizer_rule.rs
index 5f18bfe244449..9fd8b0133481a 100644
--- a/datafusion-examples/examples/optimizer_rule.rs
+++ b/datafusion-examples/examples/optimizer_rule.rs
@@ -146,7 +146,7 @@ impl MyOptimizerRule {
// Closure called for each sub tree
match expr {
Expr::BinaryExpr(binary_expr) if is_binary_eq(&binary_expr) => {
- // destruture the expression
+ // destructure the expression
let BinaryExpr { left, op: _, right } = binary_expr;
// rewrite to `my_eq(left, right)`
let udf = ScalarUDF::new_from_impl(MyEq::new());
@@ -205,7 +205,11 @@ impl ScalarUDFImpl for MyEq {
Ok(DataType::Boolean)
}
- fn invoke(&self, _args: &[ColumnarValue]) -> Result {
+ fn invoke_batch(
+ &self,
+ _args: &[ColumnarValue],
+ _number_rows: usize,
+ ) -> Result<ColumnarValue> {
// this example simply returns "true" which is not what a real
// implementation would do.
Ok(ColumnarValue::from(ScalarValue::from(true)))
diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs b/datafusion-examples/examples/parquet_sql_multiple_files.rs
deleted file mode 100644
index b0d3922a32789..0000000000000
--- a/datafusion-examples/examples/parquet_sql_multiple_files.rs
+++ /dev/null
@@ -1,112 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::path::Path;
-use std::sync::Arc;
-
-use datafusion::datasource::file_format::parquet::ParquetFormat;
-use datafusion::datasource::listing::ListingOptions;
-use datafusion::prelude::*;
-
-use object_store::local::LocalFileSystem;
-
-/// This example demonstrates executing a simple query against an Arrow data source (a directory
-/// with multiple Parquet files) and fetching results. The query is run twice, once showing
-/// how to used `register_listing_table` with an absolute path, and once registering an
-/// ObjectStore to use a relative path.
-#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
- // create local execution context
- let ctx = SessionContext::new();
-
- let test_data = datafusion::test_util::parquet_test_data();
-
- // Configure listing options
- let file_format = ParquetFormat::default().with_enable_pruning(true);
- let listing_options = ListingOptions::new(Arc::new(file_format))
- // This is a workaround for this example since `test_data` contains
- // many different parquet different files,
- // in practice use FileType::PARQUET.get_ext().
- .with_file_extension("alltypes_plain.parquet");
-
- // First example were we use an absolute path, which requires no additional setup.
- ctx.register_listing_table(
- "my_table",
- &format!("file://{test_data}/"),
- listing_options.clone(),
- None,
- None,
- )
- .await
- .unwrap();
-
- // execute the query
- let df = ctx
- .sql(
- "SELECT * \
- FROM my_table \
- LIMIT 1",
- )
- .await?;
-
- // print the results
- df.show().await?;
-
- // Second example were we temporarily move into the test data's parent directory and
- // simulate a relative path, this requires registering an ObjectStore.
- let cur_dir = std::env::current_dir()?;
-
- let test_data_path = Path::new(&test_data);
- let test_data_path_parent = test_data_path
- .parent()
- .ok_or("test_data path needs a parent")?;
-
- std::env::set_current_dir(test_data_path_parent)?;
-
- let local_fs = Arc::new(LocalFileSystem::default());
-
- let u = url::Url::parse("file://./")?;
- ctx.register_object_store(&u, local_fs);
-
- // Register a listing table - this will use all files in the directory as data sources
- // for the query
- ctx.register_listing_table(
- "relative_table",
- "./data",
- listing_options.clone(),
- None,
- None,
- )
- .await?;
-
- // execute the query
- let df = ctx
- .sql(
- "SELECT * \
- FROM relative_table \
- LIMIT 1",
- )
- .await?;
-
- // print the results
- df.show().await?;
-
- // Reset the current directory
- std::env::set_current_dir(cur_dir)?;
-
- Ok(())
-}
diff --git a/datafusion-examples/examples/parse_sql_expr.rs b/datafusion-examples/examples/parse_sql_expr.rs
index e23e5accae397..d8f0778e19e36 100644
--- a/datafusion-examples/examples/parse_sql_expr.rs
+++ b/datafusion-examples/examples/parse_sql_expr.rs
@@ -121,11 +121,11 @@ async fn query_parquet_demo() -> Result<()> {
assert_batches_eq!(
&[
- "+------------+----------------------+",
- "| double_col | sum(?table?.int_col) |",
- "+------------+----------------------+",
- "| 10.1 | 4 |",
- "+------------+----------------------+",
+ "+------------+-------------+",
+ "| double_col | sum_int_col |",
+ "+------------+-------------+",
+ "| 10.1 | 4 |",
+ "+------------+-------------+",
],
&result
);
diff --git a/datafusion-examples/examples/plan_to_sql.rs b/datafusion-examples/examples/plan_to_sql.rs
index 8ea7c2951223d..cf1202498416a 100644
--- a/datafusion-examples/examples/plan_to_sql.rs
+++ b/datafusion-examples/examples/plan_to_sql.rs
@@ -16,11 +16,25 @@
// under the License.
use datafusion::error::Result;
-
+use datafusion::logical_expr::sqlparser::ast::Statement;
use datafusion::prelude::*;
use datafusion::sql::unparser::expr_to_sql;
+use datafusion_common::DFSchemaRef;
+use datafusion_expr::{
+ Extension, LogicalPlan, LogicalPlanBuilder, UserDefinedLogicalNode,
+ UserDefinedLogicalNodeCore,
+};
+use datafusion_sql::unparser::ast::{
+ DerivedRelationBuilder, QueryBuilder, RelationBuilder, SelectBuilder,
+};
use datafusion_sql::unparser::dialect::CustomDialectBuilder;
+use datafusion_sql::unparser::extension_unparser::UserDefinedLogicalNodeUnparser;
+use datafusion_sql::unparser::extension_unparser::{
+ UnparseToStatementResult, UnparseWithinStatementResult,
+};
use datafusion_sql::unparser::{plan_to_sql, Unparser};
+use std::fmt;
+use std::sync::Arc;
/// This example demonstrates the programmatic construction of SQL strings using
/// the DataFusion Expr [`Expr`] and LogicalPlan [`LogicalPlan`] API.
@@ -44,6 +58,10 @@ use datafusion_sql::unparser::{plan_to_sql, Unparser};
///
/// 5. [`round_trip_plan_to_sql_demo`]: Create a logical plan from a SQL string, modify it using the
/// DataFrames API and convert it back to a sql string.
+///
+/// 6. [`unparse_my_logical_plan_as_statement`]: Create a custom logical plan and unparse it as a statement.
+///
+/// 7. [`unparse_my_logical_plan_as_subquery`]: Create a custom logical plan and unparse it as a subquery.
#[tokio::main]
async fn main() -> Result<()> {
@@ -53,6 +71,8 @@ async fn main() -> Result<()> {
simple_expr_to_sql_demo_escape_mysql_style()?;
simple_plan_to_sql_demo().await?;
round_trip_plan_to_sql_demo().await?;
+ unparse_my_logical_plan_as_statement().await?;
+ unparse_my_logical_plan_as_subquery().await?;
Ok(())
}
@@ -65,7 +85,7 @@ fn simple_expr_to_sql_demo() -> Result<()> {
Ok(())
}
-/// DataFusioon can remove parentheses when converting an expression to SQL.
+/// DataFusion can remove parentheses when converting an expression to SQL.
/// Note that output is intended for humans, not for other SQL engines,
/// as difference in precedence rules can cause expressions to be parsed differently.
fn simple_expr_to_pretty_sql_demo() -> Result<()> {
@@ -152,3 +172,144 @@ async fn round_trip_plan_to_sql_demo() -> Result<()> {
Ok(())
}
+
+#[derive(Debug, PartialEq, Eq, Hash, PartialOrd)]
+struct MyLogicalPlan {
+ input: LogicalPlan,
+}
+
+impl UserDefinedLogicalNodeCore for MyLogicalPlan {
+ fn name(&self) -> &str {
+ "MyLogicalPlan"
+ }
+
+ fn inputs(&self) -> Vec<&LogicalPlan> {
+ vec![&self.input]
+ }
+
+ fn schema(&self) -> &DFSchemaRef {
+ self.input.schema()
+ }
+
+ fn expressions(&self) -> Vec<Expr> {
+ vec![]
+ }
+
+ fn fmt_for_explain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "MyLogicalPlan")
+ }
+
+ fn with_exprs_and_inputs(
+ &self,
+ _exprs: Vec<Expr>,
+ inputs: Vec<LogicalPlan>,
+ ) -> Result<Self> {
+ Ok(MyLogicalPlan {
+ input: inputs.into_iter().next().unwrap(),
+ })
+ }
+}
+
+struct PlanToStatement {}
+impl UserDefinedLogicalNodeUnparser for PlanToStatement {
+ fn unparse_to_statement(
+ &self,
+ node: &dyn UserDefinedLogicalNode,
+ unparser: &Unparser,
+ ) -> Result<UnparseToStatementResult> {
+ if let Some(plan) = node.as_any().downcast_ref::<MyLogicalPlan>() {
+ let input = unparser.plan_to_sql(&plan.input)?;
+ Ok(UnparseToStatementResult::Modified(input))
+ } else {
+ Ok(UnparseToStatementResult::Unmodified)
+ }
+ }
+}
+
+/// This example demonstrates how to unparse a custom logical plan as a statement.
+/// The custom logical plan is a simple extension of the logical plan that reads from a parquet file.
+/// It can be unparsed as a statement that reads from the same parquet file.
+async fn unparse_my_logical_plan_as_statement() -> Result<()> {
+ let ctx = SessionContext::new();
+ let testdata = datafusion::test_util::parquet_test_data();
+ let inner_plan = ctx
+ .read_parquet(
+ &format!("{testdata}/alltypes_plain.parquet"),
+ ParquetReadOptions::default(),
+ )
+ .await?
+ .select_columns(&["id", "int_col", "double_col", "date_string_col"])?
+ .into_unoptimized_plan();
+
+ let node = Arc::new(MyLogicalPlan { input: inner_plan });
+
+ let my_plan = LogicalPlan::Extension(Extension { node });
+ let unparser =
+ Unparser::default().with_extension_unparsers(vec![Arc::new(PlanToStatement {})]);
+ let sql = unparser.plan_to_sql(&my_plan)?.to_string();
+ assert_eq!(
+ sql,
+ r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""#
+ );
+ Ok(())
+}
+
+struct PlanToSubquery {}
+impl UserDefinedLogicalNodeUnparser for PlanToSubquery {
+ fn unparse(
+ &self,
+ node: &dyn UserDefinedLogicalNode,
+ unparser: &Unparser,
+ _query: &mut Option<&mut QueryBuilder>,
+ _select: &mut Option<&mut SelectBuilder>,
+ relation: &mut Option<&mut RelationBuilder>,
+ ) -> Result<UnparseWithinStatementResult> {
+ if let Some(plan) = node.as_any().downcast_ref::<MyLogicalPlan>() {
+ let Statement::Query(input) = unparser.plan_to_sql(&plan.input)? else {
+ return Ok(UnparseWithinStatementResult::Unmodified);
+ };
+ let mut derived_builder = DerivedRelationBuilder::default();
+ derived_builder.subquery(input);
+ derived_builder.lateral(false);
+ if let Some(rel) = relation {
+ rel.derived(derived_builder);
+ }
+ }
+ Ok(UnparseWithinStatementResult::Modified)
+ }
+}
+
+/// This example demonstrates how to unparse a custom logical plan as a subquery.
+/// The custom logical plan is a simple extension of the logical plan that reads from a parquet file.
+/// It can be unparsed as a subquery that reads from the same parquet file, with some columns projected.
+async fn unparse_my_logical_plan_as_subquery() -> Result<()> {
+ let ctx = SessionContext::new();
+ let testdata = datafusion::test_util::parquet_test_data();
+ let inner_plan = ctx
+ .read_parquet(
+ &format!("{testdata}/alltypes_plain.parquet"),
+ ParquetReadOptions::default(),
+ )
+ .await?
+ .select_columns(&["id", "int_col", "double_col", "date_string_col"])?
+ .into_unoptimized_plan();
+
+ let node = Arc::new(MyLogicalPlan { input: inner_plan });
+
+ let my_plan = LogicalPlan::Extension(Extension { node });
+ let plan = LogicalPlanBuilder::from(my_plan)
+ .project(vec![
+ col("id").alias("my_id"),
+ col("int_col").alias("my_int"),
+ ])?
+ .build()?;
+ let unparser =
+ Unparser::default().with_extension_unparsers(vec![Arc::new(PlanToSubquery {})]);
+ let sql = unparser.plan_to_sql(&plan)?.to_string();
+ assert_eq!(
+ sql,
+ "SELECT \"?table?\".id AS my_id, \"?table?\".int_col AS my_int FROM \
+ (SELECT \"?table?\".id, \"?table?\".int_col, \"?table?\".double_col, \"?table?\".date_string_col FROM \"?table?\")",
+ );
+ Ok(())
+}
diff --git a/datafusion-examples/examples/regexp.rs b/datafusion-examples/examples/regexp.rs
index 02e74bae22af7..5419efd2faea2 100644
--- a/datafusion-examples/examples/regexp.rs
+++ b/datafusion-examples/examples/regexp.rs
@@ -148,7 +148,7 @@ async fn main() -> Result<()> {
// invalid flags will result in an error
let result = ctx
- .sql(r"select regexp_like('\b4(?!000)\d\d\d\b', 4010, 'g')")
+ .sql(r"select regexp_like('\b4(?!000)\d\d\d\b', '4010', 'g')")
.await?
.collect()
.await;
diff --git a/datafusion-examples/examples/remote_catalog.rs b/datafusion-examples/examples/remote_catalog.rs
new file mode 100644
index 0000000000000..38629328d71c4
--- /dev/null
+++ b/datafusion-examples/examples/remote_catalog.rs
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// This example shows how to implement the DataFusion [`CatalogProvider`] API
+/// for catalogs that are remote (require network access) and/or offer only
+/// asynchronous APIs such as [Polaris], [Unity], and [Hive].
+///
+/// Integrating with these catalogs is a bit more complex than with local
+/// catalogs because calls like `ctx.sql("SELECT * FROM db.schm.tbl")` may need
+/// to perform remote network requests, but many Catalog APIs are synchronous.
+/// See the documentation on [`CatalogProvider`] for more details.
+///
+/// [`CatalogProvider`]: datafusion_catalog::CatalogProvider
+///
+/// [Polaris]: https://github.com/apache/polaris
+/// [Unity]: https://github.com/unitycatalog/unitycatalog
+/// [Hive]: https://hive.apache.org/
+use arrow::array::record_batch;
+use arrow_schema::{Field, Fields, Schema, SchemaRef};
+use async_trait::async_trait;
+use datafusion::catalog::TableProvider;
+use datafusion::common::Result;
+use datafusion::execution::SendableRecordBatchStream;
+use datafusion::physical_plan::memory::MemoryExec;
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::{DataFrame, SessionContext};
+use datafusion_catalog::{AsyncSchemaProvider, Session};
+use datafusion_common::{assert_batches_eq, internal_datafusion_err, plan_err};
+use datafusion_expr::{Expr, TableType};
+use futures::TryStreamExt;
+use std::any::Any;
+use std::sync::Arc;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+ // As always, we create a session context to interact with DataFusion
+ let ctx = SessionContext::new();
+
+ // Make a connection to the remote catalog, asynchronously, and configure it
+ let remote_catalog_interface = Arc::new(RemoteCatalogInterface::connect().await?);
+
+ // Create an adapter to provide the AsyncSchemaProvider interface to DataFusion
+ // based on our remote catalog interface
+ let remote_catalog_adapter = RemoteCatalogDatafusionAdapter(remote_catalog_interface);
+
+ // Here is a query that selects data from a table in the remote catalog.
+ let sql = "SELECT * from remote_schema.remote_table";
+
+ // The `SessionContext::sql` interface is async, but it does not
+ // support asynchronous access to catalogs, so we cannot register our schema provider
+ // directly and the following query fails to find our table.
+ let results = ctx.sql(sql).await;
+ assert_eq!(
+ results.unwrap_err().to_string(),
+ "Error during planning: table 'datafusion.remote_schema.remote_table' not found"
+ );
+
+ // Instead, to use a remote catalog, we must use lower level APIs on
+ // SessionState (what `SessionContext::sql` does internally).
+ let state = ctx.state();
+
+ // First, parse the SQL (but don't plan it / resolve any table references)
+ let dialect = state.config().options().sql_parser.dialect.as_str();
+ let statement = state.sql_to_statement(sql, dialect)?;
+
+ // Find all `TableReferences` in the parsed queries. These correspond to the
+ // tables referred to by the query (in this case
+ // `remote_schema.remote_table`)
+ let references = state.resolve_table_references(&statement)?;
+
+ // Now we can asynchronously resolve the table references to get a cached catalog
+ // that we can use for our query
+ let resolved_catalog = remote_catalog_adapter
+ .resolve(&references, state.config(), "datafusion", "remote_schema")
+ .await?;
+
+ // This resolved catalog only makes sense for this query and so we create a clone
+ // of the session context with the resolved catalog
+ let query_ctx = ctx.clone();
+
+ query_ctx
+ .catalog("datafusion")
+ .ok_or_else(|| internal_datafusion_err!("default catalog was not installed"))?
+ .register_schema("remote_schema", resolved_catalog)?;
+
+ // We can now continue planning the query with this new query-specific context that
+ // contains our cached catalog
+ let query_state = query_ctx.state();
+
+ let plan = query_state.statement_to_plan(statement).await?;
+ let results = DataFrame::new(state, plan).collect().await?;
+ assert_batches_eq!(
+ [
+ "+----+-------+",
+ "| id | name |",
+ "+----+-------+",
+ "| 1 | alpha |",
+ "| 2 | beta |",
+ "| 3 | gamma |",
+ "+----+-------+",
+ ],
+ &results
+ );
+
+ Ok(())
+}
+
+/// This is an example of an API that interacts with a remote catalog.
+///
+/// Specifically, its APIs are all `async` and thus can not be used by
+/// [`SchemaProvider`] or [`TableProvider`] directly.
+#[derive(Debug)]
+struct RemoteCatalogInterface {}
+
+impl RemoteCatalogInterface {
+ /// Establish a connection to the remote catalog
+ pub async fn connect() -> Result<Self> {
+ // In a real implementation this method might connect to a remote
+ // catalog, validate credentials, cache basic information, etc
+ Ok(Self {})
+ }
+
+ /// Fetches information for a specific table
+ pub async fn table_info(&self, name: &str) -> Result<SchemaRef> {