Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor commands that accept multiple input files to use improved process_input helper #1496

Merged
merged 7 commits into from
Dec 26, 2023
18 changes: 16 additions & 2 deletions src/cmd/cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ Usage:
qsv cat columns [options] [<input>...]
qsv cat --help

cat arguments:
<input>... The CSV file(s) to read. Use '-' for standard input.
If input is a directory, all files in the directory will
be read as input.
If the input is a file with a '.infile-list' extension,
the file will be read as a list of input files.
If the inputs are snappy-compressed file(s), they will be
decompressed automatically.

cat options:
COLUMNS OPTION:
-p, --pad When concatenating columns, this flag will cause
Expand Down Expand Up @@ -57,6 +66,8 @@ Common options:
Must be a single character. (default: ,)
"#;

use std::path::PathBuf;

use indexmap::{IndexMap, IndexSet};
use serde::Deserialize;
use tempfile;
Expand All @@ -73,7 +84,7 @@ struct Args {
cmd_columns: bool,
flag_group: bool,
flag_group_name: String,
arg_input: Vec<String>,
arg_input: Vec<PathBuf>,
flag_pad: bool,
flag_flexible: bool,
flag_output: Option<String>,
Expand All @@ -82,7 +93,10 @@ struct Args {
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut args: Args = util::get_args(USAGE, argv)?;

let tmpdir = tempfile::tempdir()?;
args.arg_input = util::process_input(args.arg_input, &tmpdir, "")?;
if args.cmd_rows {
args.cat_rows()
} else if args.cmd_rowskey {
Expand Down
2 changes: 1 addition & 1 deletion src/cmd/describegpt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.arg_input.clone().unwrap_or_else(|| "-".to_string()),
)],
&tmpdir,
"No data on stdin. Please provide at least one input file or pipe data to stdin.",
"",
)?;
// safety: we just checked that there is at least one input file
let input_path = work_input[0]
Expand Down
17 changes: 14 additions & 3 deletions src/cmd/headers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ Usage:
qsv headers [options] [<input>...]
qsv headers --help

headers arguments:
<input>... The CSV file(s) to read. Use '-' for standard input.
If input is a directory, all files in the directory will
be read as input.
If the input is a file with a '.infile-list' extension,
the file will be read as a list of input files.
If the inputs are snappy-compressed file(s), they will be
decompressed automatically.

headers options:
-j, --just-names Only show the header names (hide column index).
This is automatically enabled if more than one
Expand All @@ -27,7 +36,7 @@ Common options:
Must be a single character. (default: ,)
"#;

use std::io;
use std::{io, path::PathBuf};

use serde::Deserialize;
use tabwriter::TabWriter;
Expand All @@ -36,15 +45,17 @@ use crate::{config::Delimiter, util, CliResult};

#[derive(Deserialize)]
struct Args {
arg_input: Vec<String>,
arg_input: Vec<PathBuf>,
flag_just_names: bool,
flag_intersect: bool,
flag_trim: bool,
flag_delimiter: Option<Delimiter>,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut args: Args = util::get_args(USAGE, argv)?;
let tmpdir = tempfile::tempdir()?;
args.arg_input = util::process_input(args.arg_input, &tmpdir, "")?;
let configs = util::many_configs(&args.arg_input, args.flag_delimiter, true, false)?;

let num_inputs = configs.len();
Expand Down
11 changes: 4 additions & 7 deletions src/cmd/sqlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ Usage:

sqlp arguments:
input The CSV file/s to query. Use '-' for standard input.
If input is a directory, all CSV files in the directory will
be used.
If input is a directory, all files in the directory will be read as input.
If the input is a file with a '.infile-list' extension, the
file will be read as a list of files to use as input.
If the inputs are snappy-compressed file(s), they will be
decompressed automatically.
Column headers are required. Use 'qsv rename _all_generic --no-headers'
Expand Down Expand Up @@ -375,11 +376,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut args: Args = util::get_args(USAGE, argv)?;

let tmpdir = tempfile::tempdir()?;
args.arg_input = process_input(
args.arg_input,
&tmpdir,
"No data on stdin. Please provide at least one input file or pipe data to stdin.",
)?;
args.arg_input = process_input(args.arg_input, &tmpdir, "")?;

let rnull_values = if args.flag_rnull_values == "<empty string>" {
vec![String::new()]
Expand Down
67 changes: 37 additions & 30 deletions src/cmd/to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ Load same files into a new/existing postgres database whose connection string is

Load files inside a directory to a local database 'test' with user `testuser`, password `pass`.

$ qsv to postgres 'postgres://testuser:pass@localhost/test' dir1
$ qsv to postgres 'postgres://testuser:pass@localhost/test' dir1

Check notice

Code scanning / devskim

Accessing localhost could indicate debug code, or could hinder scaling.

Do not leave debug code in production

Load files listed in the 'input.infile-list' to a local database 'test' with user `testuser`, password `pass`.

$ qsv to postgres 'postgres://testuser:pass@localhost/test' input.infile-list

Check notice

Code scanning / devskim

Accessing localhost could indicate debug code, or could hinder scaling.

Do not leave debug code in production

Drop tables if they exist before loading.

Expand Down Expand Up @@ -61,6 +65,10 @@ Load all files in dir1 to sqlite database `test.db`

$ qsv to sqlite test.db dir

Load files listed in the 'mydata.infile-list' to sqlite database `test.db`

$ qsv to sqlite test.db mydata.infile-list

Drop tables if they exist before loading.

$ qsv to sqlite test.db --drop file1.csv file2.csv
Expand Down Expand Up @@ -90,6 +98,14 @@ filename without the extension. Note the `output.xlsx` will be overwritten if it

$ qsv to xlsx output.xlsx file1.csv file2.csv

Load all files in dir1 into xlsx file.

$ qsv to xlsx output.xlsx dir1

Load files listed in the 'ourdata.infile-list' into xlsx file.

$ qsv to xlsx output.xlsx ourdata.infile-list

PARQUET (only available if compiled with `to_parquet` feature)
Convert to directory of parquet files. Need to select a directory, it will be created if it does not exist.
If the `to_parquet` feature is not enabled, a simpler parquet conversion is available using the `sqlp`
Expand All @@ -108,6 +124,14 @@ Convert from stdin.

$ qsv to parquet --pipe mydir -

Convert all files in dir1 into parquet files in myparquetdir.

$ qsv to parquet myparquetdir dir1

Convert files listed in the 'data.infile-list' into parquet files in myparquetdir.

$ qsv to parquet myparquetdir data.infile-list

DATAPACKAGE
Generate a datapackage, which contains stats and information about what is in the CSV files.

Expand All @@ -125,6 +149,10 @@ Generate a `datapackage.json` file from all the files in dir1

$ qsv to datapackage datapackage.json dir1

Generate a `datapackage.json` file from all the files listed in the 'data.infile-list'

$ qsv to datapackage datapackage.json data.infile-list

For all other conversions you can output the datapackage created by specifying `--print-package`.

$ qsv to xlsx datapackage.xlsx --stats --print-package file1.csv file2.csv
Expand Down Expand Up @@ -218,6 +246,9 @@ impl From<csvs_convert::DescribeError> for CliError {
}
}

/// Error message handed to `process_input` by every `to` subcommand (postgres, sqlite,
/// parquet, xlsx, datapackage); it is reported when no usable input remains after processing.
// NOTE(review): the wording says "connection string", but the first argument of the sqlite,
// xlsx, parquet and datapackage subcommands is a file or directory name - confirm whether the
// message should be generalized.
static EMPTY_STDIN_ERRMSG: &str =
"No data on stdin. Need to add connection string as first argument then the input CSVs";

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
debug!("'to' command running");
Expand All @@ -239,11 +270,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {

if args.cmd_postgres {
debug!("converting to postgres");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add connection string as first argument then the input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;
if args.flag_dump {
options.dump_file = args.arg_postgres.expect("checked above");
output = csvs_to_postgres_with_options(String::new(), arg_input, options)?;
Expand All @@ -257,12 +284,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
debug!("conversion to postgres complete");
} else if args.cmd_sqlite {
debug!("converting to sqlite");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of a sqlite db as first argument then the \
input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;
if args.flag_dump {
options.dump_file = args.arg_sqlite.expect("checked above");
output = csvs_to_sqlite_with_options(String::new(), arg_input, options)?;
Expand All @@ -278,12 +300,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
#[cfg(all(feature = "to_parquet", feature = "feature_capable"))]
{
debug!("converting to parquet");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of a parquet directory as first argument \
then the input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;
output = csvs_to_parquet_with_options(
args.arg_parquet.expect("checked above"),
arg_input,
Expand All @@ -299,24 +316,14 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}
} else if args.cmd_xlsx {
debug!("converting to xlsx");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of an xlsx file as first argument then the \
input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;

output =
csvs_to_xlsx_with_options(args.arg_xlsx.expect("checked above"), arg_input, options)?;
debug!("conversion to xlsx complete");
} else if args.cmd_datapackage {
debug!("creating datapackage");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of a datapackage file as first argument then \
the input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;

let describe_options = DescribeOptions::builder()
.delimiter(options.delimiter)
Expand Down
2 changes: 1 addition & 1 deletion src/cmd/tojsonl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.arg_input.clone().unwrap_or_else(|| "-".to_string()),
)],
&tmpdir,
"No data on stdin. Please provide at least one input file or pipe data to stdin.",
"",
)?;

// safety: there's at least one valid element in work_input
Expand Down
54 changes: 44 additions & 10 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,12 +406,15 @@ where

#[inline]
pub fn many_configs(
inps: &[String],
inps: &[PathBuf],
delim: Option<Delimiter>,
no_headers: bool,
flexible: bool,
) -> Result<Vec<Config>, String> {
let mut inps = inps.to_vec();
let mut inps = inps
.iter()
.map(|p| p.to_str().unwrap_or("-").to_owned())
.collect::<Vec<_>>();
if inps.is_empty() {
inps.push("-".to_owned()); // stdin
}
Expand Down Expand Up @@ -1441,12 +1444,13 @@ pub fn isutf8_file(path: &Path) -> Result<bool, CliError> {
/// If it's not empty, check the input files if they exist, and return an error if they don't
///
/// If the input is a directory, add all the files in the directory to the input
/// If the input is a file with the extension ".infile-list" (case-insensitive), read the file
/// and add each line as a file to the input
/// If the input is any other file, add the file to the input
/// If the input are snappy compressed files, uncompress them before adding them to the input
pub fn process_input(
mut arg_input: Vec<PathBuf>,
tmpdir: &tempfile::TempDir,
empty_stdin_errmsg: &str,
custom_empty_stdin_errmsg: &str,
) -> Result<Vec<PathBuf>, CliError> {
let mut processed_input = Vec::with_capacity(arg_input.len());

Expand All @@ -1464,11 +1468,36 @@ pub fn process_input(
arg_input.remove(0);
}

let work_input = if arg_input.len() == 1 && arg_input[0].is_dir() {
// if the input is a directory, add all the files in the directory to the input
std::fs::read_dir(&arg_input[0])?
.map(|entry| entry.map(|e| e.path()))
.collect::<Result<Vec<_>, _>>()?
let work_input = if arg_input.len() == 1 {
let input_path = &arg_input[0];
if input_path.is_dir() {
// if the input is a directory, add all the files in the directory to the input
std::fs::read_dir(input_path)?
.map(|entry| entry.map(|e| e.path()))
.collect::<Result<Vec<_>, _>>()?
} else if input_path.is_file() {
// if the input is a file and has the extension "infile-list" case-insensitive,
// read the file. Each line is a file path
if input_path
.extension()
.and_then(std::ffi::OsStr::to_str)
.map(str::to_lowercase)
== Some("infile-list".to_string())
{
let mut input_file = std::fs::File::open(input_path)?;
let mut input_file_contents = String::new();
input_file.read_to_string(&mut input_file_contents)?;
input_file_contents
.lines()
.map(PathBuf::from)
.collect::<Vec<_>>()
} else {
// if the input is not an ".infile-list" file, add the file to the input
arg_input
}
} else {
arg_input
}
} else {
arg_input
};
Expand Down Expand Up @@ -1502,7 +1531,12 @@ pub fn process_input(
}

if processed_input.is_empty() {
return fail_clierror!("{empty_stdin_errmsg}");
if custom_empty_stdin_errmsg.is_empty() {
return fail_clierror!(
"No data on stdin. Please provide at least one input file or pipe data to stdin."
);
}
return fail_clierror!("{custom_empty_stdin_errmsg}");
}
log::debug!("processed input: {:?}", processed_input);
Ok(processed_input)
Expand Down
Loading