Commit: add csv mode to datafusion cli

Jiayu Liu committed May 17, 2021
1 parent ed92673 commit f185486
Showing 8 changed files with 238 additions and 17 deletions.
50 changes: 48 additions & 2 deletions .github/workflows/rust.yml
@@ -23,7 +23,6 @@ on:
pull_request:

jobs:

# build the library, a compilation step used by multiple steps below
linux-build-lib:
name: Build Libraries on AMD64 Rust ${{ matrix.rust }}
@@ -133,6 +132,53 @@ jobs:
# snmalloc requires cmake so build without default features
cargo test --no-default-features
integration-test:
name: "Integration Test"
needs: [linux-build-lib]
runs-on: ubuntu-latest
services:
postgres:
image: postgres:13
env:
POSTGRES_PASSWORD: postgres
POSTGRES_DB: db_test
ports:
- 5432/tcp
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.8"
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
python -m pip install --upgrade numpy==1.20.3 pandas==1.2.4
- name: Allow access of psql
run: |
# make sure psql can access the server
echo "$POSTGRES_HOST:$POSTGRES_PORT:$POSTGRES_DB:$POSTGRES_USER:$POSTGRES_PASSWORD" | tee ~/.pgpass
chmod 0600 ~/.pgpass
psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'select 1 as num'
env:
POSTGRES_HOST: localhost
POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}
POSTGRES_DB: db_test
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
- name: Test Psql Parity
run: python -m unittest -v integration-tests/test_psql_parity.py
env:
POSTGRES_HOST: localhost
POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}
POSTGRES_DB: db_test
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres

windows-and-macos:
name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }}
runs-on: ${{ matrix.os }}
@@ -242,7 +288,7 @@ jobs:
- name: Run Miri Checks
env:
RUST_BACKTRACE: full
RUST_LOG: 'trace'
RUST_LOG: "trace"
run: |
export MIRIFLAGS="-Zmiri-disable-isolation"
cargo miri setup
27 changes: 15 additions & 12 deletions datafusion-cli/src/main.rs
@@ -20,7 +20,10 @@
use clap::{crate_version, App, Arg};
use datafusion::error::Result;
use datafusion::execution::context::{ExecutionConfig, ExecutionContext};
use datafusion_cli::{print_format::PrintFormat, PrintOptions};
use datafusion_cli::{
print_format::{all_print_formats, PrintFormat},
PrintOptions,
};
use rustyline::Editor;
use std::env;
use std::fs::File;
@@ -63,14 +66,22 @@ pub async fn main() {
)
.arg(
Arg::with_name("format")
.help("Output format (possible values: table, csv, tsv, json)")
.help("Output format")
.long("format")
.default_value("table")
.validator(is_valid_format)
.possible_values(
&all_print_formats()
.iter()
.map(|format| format.to_string())
.collect::<Vec<_>>()
.iter()
.map(|i| i.as_str())
.collect::<Vec<_>>(),
)
.takes_value(true),
)
.arg(
Arg::with_name("quite")
Arg::with_name("quiet")
.help("Reduce printing other than the results and work quietly")
.short("q")
.long("quiet")
@@ -189,14 +200,6 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_options: PrintO
rl.save_history(".history").ok();
}

fn is_valid_format(format: String) -> std::result::Result<(), String> {
if format.parse::<PrintFormat>().is_ok() {
Ok(())
} else {
Err(format!("Format '{}' not supported", format))
}
}

fn is_valid_file(dir: String) -> std::result::Result<(), String> {
if Path::new(&dir).is_file() {
Ok(())
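The nested `collect` calls in the new `--format` argument above exist because clap's `possible_values` expects string slices while `Display` yields owned `String`s: the owned values must be collected first and kept alive while `&str` views are borrowed from them. A minimal standalone sketch of the same pattern (the literal format names below are just stand-ins for `all_print_formats()`):

```rust
// Sketch of the ownership dance behind `possible_values`: Display produces
// owned Strings, but clap wants &str, so the Strings are collected into a
// Vec that stays alive while &str views are borrowed from it.
fn main() {
    let formats = ["csv", "tsv", "table", "json"]; // stand-in for all_print_formats()
    let owned: Vec<String> = formats.iter().map(|f| f.to_string()).collect();
    let borrowed: Vec<&str> = owned.iter().map(|s| s.as_str()).collect();
    assert_eq!(borrowed, ["csv", "tsv", "table", "json"]);
}
```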
30 changes: 30 additions & 0 deletions datafusion-cli/src/print_format.rs
@@ -21,6 +21,7 @@ use arrow::json::ArrayWriter;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::arrow::util::pretty;
use datafusion::error::{DataFusionError, Result};
use std::fmt;
use std::str::FromStr;

/// Allow records to be printed in different formats
@@ -32,6 +33,16 @@ pub enum PrintFormat {
Json,
}

/// returns all print formats
pub fn all_print_formats() -> Vec<PrintFormat> {
vec![
PrintFormat::Csv,
PrintFormat::Tsv,
PrintFormat::Table,
PrintFormat::Json,
]
}

impl FromStr for PrintFormat {
type Err = ();
fn from_str(s: &str) -> std::result::Result<Self, ()> {
@@ -45,6 +56,17 @@ impl FromStr for PrintFormat {
}
}

impl fmt::Display for PrintFormat {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
Self::Csv => write!(f, "csv"),
Self::Tsv => write!(f, "tsv"),
Self::Table => write!(f, "table"),
Self::Json => write!(f, "json"),
}
}
}

fn print_batches_to_json(batches: &[RecordBatch]) -> Result<String> {
let mut bytes = vec![];
{
@@ -108,6 +130,14 @@ mod tests {
assert_eq!(PrintFormat::Table, format);
}

#[test]
fn test_to_str() {
assert_eq!("csv", PrintFormat::Csv.to_string());
assert_eq!("table", PrintFormat::Table.to_string());
assert_eq!("tsv", PrintFormat::Tsv.to_string());
assert_eq!("json", PrintFormat::Json.to_string());
}

#[test]
fn test_from_str_failure() {
assert_eq!(true, "pretty".parse::<PrintFormat>().is_err());
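Since the new `Display` impl mirrors `FromStr`, the two can be checked together; a hedged sketch of a round-trip test one could add alongside the tests above (not part of this commit):

```rust
// Illustrative round-trip check (not in this commit): the Display output of
// every variant returned by all_print_formats() should parse back to the
// same variant through FromStr.
#[test]
fn display_from_str_round_trip() {
    for format in all_print_formats() {
        let parsed: PrintFormat = format.to_string().parse().expect("format should round-trip");
        assert_eq!(format, parsed);
    }
}
```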
6 changes: 3 additions & 3 deletions datafusion/docs/cli.md
@@ -25,7 +25,7 @@ The DataFusion CLI is a command-line interactive SQL utility that allows queries

Use the following commands to clone this repository and run the CLI. This will require the Rust toolchain to be installed. Rust can be installed from [https://rustup.rs/](https://rustup.rs/).

```sh
```bash
git clone https://github.com/apache/arrow-datafusion
cd arrow-datafusion/datafusion-cli
cargo run --release
@@ -35,7 +35,7 @@

Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note that there is `.dockerignore` file in the root of the repository that may need to be deleted in order for this to work.

```sh
```bash
git clone https://github.com/apache/arrow-datafusion
cd arrow-datafusion
docker build -f datafusion-cli/Dockerfile . --tag datafusion-cli
@@ -64,7 +64,7 @@ Type `exit` or `quit` to exit the CLI.
Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files.

```sql
CREATE EXTERNAL TABLE taxi
CREATE EXTERNAL TABLE taxi
STORED AS PARQUET
LOCATION '/mnt/nyctaxi/tripdata.parquet';
```
15 changes: 15 additions & 0 deletions integration-tests/__init__.py
@@ -0,0 +1,15 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
22 changes: 22 additions & 0 deletions integration-tests/sqls/simple_math_expressions.sql
@@ -0,0 +1,22 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at

-- http://www.apache.org/licenses/LICENSE-2.0

-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

SELECT
abs(-1.1) as abs,
exp(2.0) as exp,
sin(3.0) as sin,
cos(4.0) as cos,
tan(5.0) as tan;
17 changes: 17 additions & 0 deletions integration-tests/sqls/simple_select.sql
@@ -0,0 +1,17 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at

-- http://www.apache.org/licenses/LICENSE-2.0

-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

SELECT 1 as num;
88 changes: 88 additions & 0 deletions integration-tests/test_psql_parity.py
@@ -0,0 +1,88 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import numpy as np
import io
import os
import subprocess
from pathlib import Path
import unittest

pg_db, pg_user, pg_host, pg_port = [
os.environ.get(i)
for i in ("POSTGRES_DB", "POSTGRES_USER", "POSTGRES_HOST", "POSTGRES_PORT")
]


def generate_csv_from_datafusion(fname: str):
return subprocess.check_output(
[
"cargo",
"run",
"--release",
"-q",
"--bin",
"datafusion-cli",
"--",
"-f",
fname,
"--format",
"csv",
"-q",
],
)


def generate_csv_from_psql(fname: str):
return subprocess.check_output(
[
"psql",
"-d",
pg_db,
"-h",
pg_host,
"-p",
pg_port,
"-U",
pg_user,
"-X",
"--csv",
"-f",
fname,
]
)


class PsqlParityTest(unittest.TestCase):
def test_parity(self):
root = Path(os.path.dirname(__file__)) / "sqls"
files = set(root.glob("*.sql"))
self.assertEqual(len(files), 2, msg="tests are missed")
for fname in files:
with self.subTest(fname=fname):
datafusion_output = pd.read_csv(
io.BytesIO(generate_csv_from_datafusion(fname))
)
psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname)))
self.assertTrue(
np.allclose(datafusion_output, psql_output),
msg=f"data fusion output={datafusion_output}, psql_output={psql_output}",
)


if __name__ == "__main__":
unittest.main()
