diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index f76873ef77ae..f492b2e31d0b 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -23,7 +23,6 @@ on:
   pull_request:
 
 jobs:
-
   # build the library, a compilation step used by multiple steps below
   linux-build-lib:
     name: Build Libraries on AMD64 Rust ${{ matrix.rust }}
@@ -61,17 +60,19 @@ jobs:
           rustup component add rustfmt
       - name: Build Workspace
         run: |
-          export CARGO_HOME="/github/home/.cargo"
-          export CARGO_TARGET_DIR="/github/home/target"
           cargo build
+        env:
+          CARGO_HOME: "/github/home/.cargo"
+          CARGO_TARGET_DIR: "/github/home/target"
       # Ballista is currently not part of the main workspace so requires a separate build step
       - name: Build Ballista
         run: |
-          export CARGO_HOME="/github/home/.cargo"
-          export CARGO_TARGET_DIR="/github/home/target"
           cd ballista/rust
           # snmalloc requires cmake so build without default features
           cargo build --no-default-features
+        env:
+          CARGO_HOME: "/github/home/.cargo"
+          CARGO_TARGET_DIR: "/github/home/target"
 
   # test the crate
   linux-test:
@@ -111,8 +112,6 @@ jobs:
           rustup component add rustfmt
       - name: Run tests
         run: |
-          export CARGO_HOME="/github/home/.cargo"
-          export CARGO_TARGET_DIR="/github/home/target"
           export ARROW_TEST_DATA=$(pwd)/testing/data
           export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
           # run tests on all workspace members with default feature list
@@ -122,16 +121,69 @@ jobs:
           cargo test --no-default-features
           cargo run --example csv_sql
           cargo run --example parquet_sql
+        env:
+          CARGO_HOME: "/github/home/.cargo"
+          CARGO_TARGET_DIR: "/github/home/target"
       # Ballista is currently not part of the main workspace so requires a separate test step
       - name: Run Ballista tests
         run: |
-          export CARGO_HOME="/github/home/.cargo"
-          export CARGO_TARGET_DIR="/github/home/target"
           export ARROW_TEST_DATA=$(pwd)/testing/data
           export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
           cd ballista/rust
           # snmalloc requires cmake so build without default features
           cargo test --no-default-features
+        env:
+          CARGO_HOME: "/github/home/.cargo"
+          CARGO_TARGET_DIR: "/github/home/target"
+
+  integration-test:
+    name: "Integration Test"
+    needs: [linux-build-lib]
+    runs-on: ubuntu-latest
+    services:
+      postgres:
+        image: postgres:13
+        env:
+          POSTGRES_PASSWORD: postgres
+          POSTGRES_DB: db_test
+        ports:
+          - 5432/tcp
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: "3.8"
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          python -m pip install --upgrade numpy==1.20.3 pandas==1.2.4
+      - name: Allow access of psql
+        run: |
+          # make sure psql can access the server
+          echo "$POSTGRES_HOST:$POSTGRES_PORT:$POSTGRES_DB:$POSTGRES_USER:$POSTGRES_PASSWORD" | tee ~/.pgpass
+          chmod 0600 ~/.pgpass
+          psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'select now() as now'
+        env:
+          POSTGRES_HOST: localhost
+          POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}
+          POSTGRES_DB: db_test
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+      - name: Build datafusion-cli
+        run: cargo build --bin datafusion-cli
+      - name: Test Psql Parity
+        run: python -m unittest -v integration-tests/test_psql_parity.py
+        env:
+          POSTGRES_HOST: localhost
+          POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}
+          POSTGRES_DB: db_test
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
 
   windows-and-macos:
     name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }}
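A note on the integration-test job added above: the Postgres service container publishes 5432 on a randomly assigned host port, and `${{ job.services.postgres.ports[5432] }}` resolves that port at runtime for the test steps. A minimal, hypothetical Rust sketch (not part of this diff) of how a consumer of the exported POSTGRES_* variables could assemble a connection URL:

```rust
use std::env;

// Hypothetical helper, not part of this PR: build a libpq-style URL from the
// POSTGRES_* variables that the "Allow access of psql" and "Test Psql Parity"
// steps export, including the randomly mapped host port.
fn postgres_url() -> Result<String, env::VarError> {
    let user = env::var("POSTGRES_USER")?;
    let password = env::var("POSTGRES_PASSWORD")?;
    let host = env::var("POSTGRES_HOST")?;
    let port = env::var("POSTGRES_PORT")?;
    let db = env::var("POSTGRES_DB")?;
    Ok(format!(
        "postgresql://{}:{}@{}:{}/{}",
        user, password, host, port, db
    ))
}

fn main() {
    match postgres_url() {
        Ok(url) => println!("would connect to {}", url),
        Err(e) => eprintln!("POSTGRES_* variable missing: {}", e),
    }
}
```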
@@ -156,9 +208,10 @@ jobs:
         run: |
           export ARROW_TEST_DATA=$(pwd)/testing/data
           export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
-          # do not produce debug symbols to keep memory usage down
-          export RUSTFLAGS="-C debuginfo=0"
           cargo test
+        env:
+          # do not produce debug symbols to keep memory usage down
+          RUSTFLAGS: "-C debuginfo=0"
 
   lint:
     name: Lint
@@ -212,9 +265,10 @@ jobs:
           rustup component add rustfmt clippy
       - name: Run clippy
         run: |
-          export CARGO_HOME="/github/home/.cargo"
-          export CARGO_TARGET_DIR="/github/home/target"
           cargo clippy --all-targets --workspace -- -D warnings
+        env:
+          CARGO_HOME: "/github/home/.cargo"
+          CARGO_TARGET_DIR: "/github/home/target"
 
   miri-checks:
     name: MIRI
@@ -242,9 +296,9 @@ jobs:
       - name: Run Miri Checks
         env:
           RUST_BACKTRACE: full
-          RUST_LOG: 'trace'
+          RUST_LOG: "trace"
+          MIRIFLAGS: "-Zmiri-disable-isolation"
         run: |
-          export MIRIFLAGS="-Zmiri-disable-isolation"
           cargo miri setup
           cargo clean
           # Ignore MIRI errors until we can get a clean run
@@ -275,9 +329,6 @@ jobs:
             key: ${{ runner.os }}-${{ matrix.arch }}-target-coverage-cache-${{ matrix.rust }}-
       - name: Run coverage
         run: |
-          export CARGO_HOME="/home/runner/.cargo"
-          export CARGO_TARGET_DIR="/home/runner/target"
-
           export ARROW_TEST_DATA=$(pwd)/testing/data
           export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
 
@@ -285,6 +336,9 @@ jobs:
           # see https://github.com/xd009642/tarpaulin/issues/618
           cargo install --version 0.16.0 cargo-tarpaulin
           cargo tarpaulin --out Xml
+        env:
+          CARGO_HOME: "/home/runner/.cargo"
+          CARGO_TARGET_DIR: "/home/runner/target"
       - name: Report coverage
         continue-on-error: true
         run: bash <(curl -s https://codecov.io/bash)
diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs
index f36b5d93d21f..5b35880580b2 100644
--- a/datafusion-cli/src/main.rs
+++ b/datafusion-cli/src/main.rs
@@ -20,7 +20,10 @@
 use clap::{crate_version, App, Arg};
 use datafusion::error::Result;
 use datafusion::execution::context::{ExecutionConfig, ExecutionContext};
-use datafusion_cli::{print_format::PrintFormat, PrintOptions};
+use datafusion_cli::{
+    print_format::{all_print_formats, PrintFormat},
+    PrintOptions,
+};
 use rustyline::Editor;
 use std::env;
 use std::fs::File;
@@ -63,14 +66,22 @@ pub async fn main() {
         )
         .arg(
             Arg::with_name("format")
-                .help("Output format (possible values: table, csv, tsv, json)")
+                .help("Output format")
                 .long("format")
                 .default_value("table")
-                .validator(is_valid_format)
+                .possible_values(
+                    &all_print_formats()
+                        .iter()
+                        .map(|format| format.to_string())
+                        .collect::<Vec<String>>()
+                        .iter()
+                        .map(|i| i.as_str())
+                        .collect::<Vec<&str>>(),
+                )
                 .takes_value(true),
         )
         .arg(
-            Arg::with_name("quite")
+            Arg::with_name("quiet")
                 .help("Reduce printing other than the results and work quietly")
                 .short("q")
                 .long("quiet")
@@ -189,14 +200,6 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_options: PrintOptions) {
     rl.save_history(".history").ok();
 }
 
-fn is_valid_format(format: String) -> std::result::Result<(), String> {
-    if format.parse::<PrintFormat>().is_ok() {
-        Ok(())
-    } else {
-        Err(format!("Format '{}' not supported", format))
-    }
-}
-
 fn is_valid_file(dir: String) -> std::result::Result<(), String> {
     if Path::new(&dir).is_file() {
         Ok(())
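The `--format` argument above now derives its accepted values from `all_print_formats()` instead of the removed `is_valid_format` validator, so clap itself rejects unknown formats before the CLI runs. A standalone sketch of the double-collect pattern used there (string literals stand in for the real enum): clap 2.x's `possible_values` takes `&[&str]`, while `to_string()` yields owned `String`s, so the owned vector has to be built first and then borrowed from.

```rust
// Standalone sketch of the `possible_values` pattern in main.rs; the literals
// below stand in for `all_print_formats()`.
fn main() {
    // First collect the owned strings (Display output in the real code)...
    let owned: Vec<String> = ["csv", "tsv", "table", "json"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    // ...then borrow `&str` slices out of that vector, which is the shape
    // clap's `possible_values(&[&str])` expects.
    let borrowed: Vec<&str> = owned.iter().map(|s| s.as_str()).collect();
    assert_eq!(borrowed, ["csv", "tsv", "table", "json"]);
    println!("--format accepts: {:?}", borrowed);
}
```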
diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs
index 85caaa3c5276..c7aa06149678 100644
--- a/datafusion-cli/src/print_format.rs
+++ b/datafusion-cli/src/print_format.rs
@@ -21,6 +21,7 @@ use arrow::json::ArrayWriter;
 use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::arrow::util::pretty;
 use datafusion::error::{DataFusionError, Result};
+use std::fmt;
 use std::str::FromStr;
 
 /// Allow records to be printed in different formats
@@ -32,6 +33,16 @@ pub enum PrintFormat {
     Json,
 }
 
+/// returns all print formats
+pub fn all_print_formats() -> Vec<PrintFormat> {
+    vec![
+        PrintFormat::Csv,
+        PrintFormat::Tsv,
+        PrintFormat::Table,
+        PrintFormat::Json,
+    ]
+}
+
 impl FromStr for PrintFormat {
     type Err = ();
     fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
@@ -45,6 +56,17 @@ impl FromStr for PrintFormat {
     }
 }
 
+impl fmt::Display for PrintFormat {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            Self::Csv => write!(f, "csv"),
+            Self::Tsv => write!(f, "tsv"),
+            Self::Table => write!(f, "table"),
+            Self::Json => write!(f, "json"),
+        }
+    }
+}
+
 fn print_batches_to_json(batches: &[RecordBatch]) -> Result<String> {
     let mut bytes = vec![];
     {
@@ -108,6 +130,14 @@ mod tests {
         assert_eq!(PrintFormat::Table, format);
     }
 
+    #[test]
+    fn test_to_str() {
+        assert_eq!("csv", PrintFormat::Csv.to_string());
+        assert_eq!("table", PrintFormat::Table.to_string());
+        assert_eq!("tsv", PrintFormat::Tsv.to_string());
+        assert_eq!("json", PrintFormat::Json.to_string());
+    }
+
     #[test]
     fn test_from_str_failure() {
         assert_eq!(true, "pretty".parse::<PrintFormat>().is_err());
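The new `Display` impl is the inverse of the existing `FromStr`, which is what lets main.rs feed `all_print_formats()` into clap's `possible_values`: every advertised value is guaranteed to parse back. A standalone round-trip check, using a trimmed copy of the enum and assuming the same four lowercase names:

```rust
use std::fmt;
use std::str::FromStr;

// Trimmed copy for a standalone check; the real enum lives in
// datafusion-cli/src/print_format.rs.
#[derive(Debug, Clone, Copy, PartialEq)]
enum PrintFormat {
    Csv,
    Tsv,
    Table,
    Json,
}

impl fmt::Display for PrintFormat {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Self::Csv => write!(f, "csv"),
            Self::Tsv => write!(f, "tsv"),
            Self::Table => write!(f, "table"),
            Self::Json => write!(f, "json"),
        }
    }
}

impl FromStr for PrintFormat {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "csv" => Ok(Self::Csv),
            "tsv" => Ok(Self::Tsv),
            "table" => Ok(Self::Table),
            "json" => Ok(Self::Json),
            _ => Err(()),
        }
    }
}

fn main() {
    for format in &[PrintFormat::Csv, PrintFormat::Tsv, PrintFormat::Table, PrintFormat::Json] {
        // Display output must parse back to the same variant.
        let round_tripped: PrintFormat = format.to_string().parse().unwrap();
        assert_eq!(*format, round_tripped);
    }
    println!("all four formats round-trip through Display/FromStr");
}
```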
diff --git a/datafusion/docs/cli.md b/datafusion/docs/cli.md
index 27605b2e98c0..920287243677 100644
--- a/datafusion/docs/cli.md
+++ b/datafusion/docs/cli.md
@@ -25,7 +25,7 @@ The DataFusion CLI is a command-line interactive SQL utility that allows queries
 
 Use the following commands to clone this repository and run the CLI. This will require the Rust toolchain to be installed. Rust can be installed from [https://rustup.rs/](https://rustup.rs/).
 
-```sh
+```bash
 git clone https://github.com/apache/arrow-datafusion
 cd arrow-datafusion/datafusion-cli
 cargo run --release
@@ -35,7 +35,7 @@ cargo run --release
 
 Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note that there is `.dockerignore` file in the root of the repository that may need to be deleted in order for this to work.
 
-```sh
+```bash
 git clone https://github.com/apache/arrow-datafusion
 cd arrow-datafusion
 docker build -f datafusion-cli/Dockerfile . --tag datafusion-cli
@@ -64,7 +64,7 @@ Type `exit` or `quit` to exit the CLI.
 
 Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files.
 
 ```sql
-CREATE EXTERNAL TABLE taxi 
+CREATE EXTERNAL TABLE taxi
 STORED AS PARQUET LOCATION '/mnt/nyctaxi/tripdata.parquet';
 ```
diff --git a/integration-tests/__init__.py b/integration-tests/__init__.py
new file mode 100644
index 000000000000..8516388ee621
--- /dev/null
+++ b/integration-tests/__init__.py
@@ -0,0 +1,15 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/integration-tests/sqls/simple_math_expressions.sql b/integration-tests/sqls/simple_math_expressions.sql
new file mode 100644
index 000000000000..504689d51130
--- /dev/null
+++ b/integration-tests/sqls/simple_math_expressions.sql
@@ -0,0 +1,22 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+
+-- http://www.apache.org/licenses/LICENSE-2.0
+
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+SELECT
+  abs(-1.1) as abs,
+  exp(2.0) as exp,
+  sin(3.0) as sin,
+  cos(4.0) as cos,
+  tan(5.0) as tan;
diff --git a/integration-tests/sqls/simple_select.sql b/integration-tests/sqls/simple_select.sql
new file mode 100644
index 000000000000..78f54bdd8ece
--- /dev/null
+++ b/integration-tests/sqls/simple_select.sql
@@ -0,0 +1,17 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+
+-- http://www.apache.org/licenses/LICENSE-2.0
+
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+SELECT 1 as num;
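For reference, the values simple_math_expressions.sql asks both engines to compute, reproduced with plain f64 std math. This is illustrative only (neither engine's implementation); the parity test compares the two outputs numerically with np.allclose, so small formatting differences are tolerated:

```rust
// Illustrative only: the quantities from simple_math_expressions.sql,
// computed with Rust's f64 std math rather than either SQL engine.
fn main() {
    println!("abs = {}", (-1.1f64).abs()); // 1.1
    println!("exp = {}", 2.0f64.exp()); // ~7.389
    println!("sin = {}", 3.0f64.sin()); // ~0.141
    println!("cos = {}", 4.0f64.cos()); // ~-0.654
    println!("tan = {}", 5.0f64.tan()); // ~-3.381
}
```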
diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py
new file mode 100644
index 000000000000..204f9063297e
--- /dev/null
+++ b/integration-tests/test_psql_parity.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import numpy as np
+import io
+import os
+import subprocess
+from pathlib import Path
+import unittest
+
+pg_db, pg_user, pg_host, pg_port = [
+    os.environ.get(i)
+    for i in (
+        "POSTGRES_DB",
+        "POSTGRES_USER",
+        "POSTGRES_HOST",
+        "POSTGRES_PORT",
+    )
+]
+
+
+def generate_csv_from_datafusion(fname: str):
+    return subprocess.check_output(
+        [
+            "./target/debug/datafusion-cli",
+            "-f",
+            fname,
+            "--format",
+            "csv",
+            "-q",
+        ],
+    )
+
+
+def generate_csv_from_psql(fname: str):
+    return subprocess.check_output(
+        [
+            "psql",
+            "-d",
+            pg_db,
+            "-h",
+            pg_host,
+            "-p",
+            pg_port,
+            "-U",
+            pg_user,
+            "-X",
+            "--csv",
+            "-f",
+            fname,
+        ]
+    )
+
+
+class PsqlParityTest(unittest.TestCase):
+    def test_parity(self):
+        root = Path(os.path.dirname(__file__)) / "sqls"
+        files = set(root.glob("*.sql"))
+        self.assertEqual(len(files), 2, msg="tests are missed")
+        for fname in files:
+            with self.subTest(fname=fname):
+                datafusion_output = pd.read_csv(
+                    io.BytesIO(generate_csv_from_datafusion(fname))
+                )
+                psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname)))
+                self.assertTrue(
+                    np.allclose(datafusion_output, psql_output),
+                    msg=f"data fusion output={datafusion_output}, psql_output={psql_output}",
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
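A hypothetical Rust analogue of test_psql_parity.py above, not part of this diff. It assumes the environment the workflow prepares (a debug build of datafusion-cli, psql on PATH with a ~/.pgpass entry, and POSTGRES_* exported) and compares raw CSV bytes; the real Python test instead loads both outputs with pandas and compares via np.allclose, so float formatting differences do not cause false failures.

```rust
use std::env;
use std::process::Command;

// Run a command and return its stdout, failing loudly on a non-zero exit.
fn csv_output(program: &str, args: &[&str]) -> Vec<u8> {
    let out = Command::new(program)
        .args(args)
        .output()
        .expect("failed to run command");
    assert!(out.status.success(), "{} exited with {}", program, out.status);
    out.stdout
}

fn main() {
    let fname = "integration-tests/sqls/simple_select.sql";
    // Fall back to the defaults the workflow uses if the variables are unset.
    let port = env::var("POSTGRES_PORT").unwrap_or_else(|_| "5432".into());
    let user = env::var("POSTGRES_USER").unwrap_or_else(|_| "postgres".into());
    let datafusion = csv_output(
        "./target/debug/datafusion-cli",
        &["-f", fname, "--format", "csv", "-q"],
    );
    let psql = csv_output(
        "psql",
        &[
            "-d", "db_test", "-h", "localhost", "-p", port.as_str(),
            "-U", user.as_str(), "-X", "--csv", "-f", fname,
        ],
    );
    assert_eq!(datafusion, psql, "datafusion and psql CSV output diverged");
}
```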