From d74c5801e9e214072925a843f829c9c02cc51b6a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 25 Dec 2023 20:16:05 -0500 Subject: [PATCH 1/7] extend process_input helper to process "infile-list" files "infile-list" files are qsv's flavor of the "infile-list" support of csvtk as per #1293 In our implementation, providing a file with the ".infile-list" extension to commands that support it (currently, `sqlp` and `to`) will read the file as a list of input files to use for the command. Will add ".infile-list" support to `cat` and `headers` commands as well --- src/util.rs | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/util.rs b/src/util.rs index 84a14c298..969f5fcba 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1441,7 +1441,8 @@ pub fn isutf8_file(path: &Path) -> Result<bool, CliError> { /// If it's not empty, check the input files if they exist, and return an error if they don't /// /// If the input is a directory, add all the files in the directory to the input -/// If the input is a file, add the file to the input +/// If the input is a file with the extension ".infile-list", read the file and add each line as a +/// file to the input. Otherwise, if the input is a file, add the file to the input /// If the input are snappy compressed files, uncompress them before adding them to the input pub fn process_input( mut arg_input: Vec<PathBuf>, @@ -1464,11 +1465,36 @@ pub fn process_input( arg_input.remove(0); } - let work_input = if arg_input.len() == 1 && arg_input[0].is_dir() { - // if the input is a directory, add all the files in the directory to the input - std::fs::read_dir(&arg_input[0])? - .map(|entry| entry.map(|e| e.path())) - .collect::<Result<Vec<_>, _>>()? 
+ let work_input = if arg_input.len() == 1 { + let input_path = &arg_input[0]; + if input_path.is_dir() { + // if the input is a directory, add all the files in the directory to the input + std::fs::read_dir(input_path)? + .map(|entry| entry.map(|e| e.path())) + .collect::<Result<Vec<_>, _>>()? + } else if input_path.is_file() { + // if the input is a file and has the extension "infile-list" case-insensitive, + // read the file. Each line is a file path + if input_path + .extension() + .and_then(std::ffi::OsStr::to_str) + .map(str::to_lowercase) + == Some("infile-list".to_string()) + { + let mut input_file = std::fs::File::open(input_path)?; + let mut input_file_contents = String::new(); + input_file.read_to_string(&mut input_file_contents)?; + input_file_contents + .lines() + .map(PathBuf::from) + .collect::<Vec<_>>() + } else { + // if the input is not an ".infile-list" file, add the file to the input + arg_input + } + } else { + arg_input + } } else { arg_input }; From 2eb43cba3f4b97ee4eaf7cdb32c88b9afa7ff81d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 25 Dec 2023 20:16:36 -0500 Subject: [PATCH 2/7] `to`: document `.infile-list` support --- src/cmd/to.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/cmd/to.rs b/src/cmd/to.rs index 1f1b21ec4..7aa9bdcb3 100644 --- a/src/cmd/to.rs +++ b/src/cmd/to.rs @@ -26,7 +26,11 @@ Load same files into a new/existing postgres database whose connection string is Load files inside a directory to a local database 'test' with user `testuser`, password `pass`. - $ qsv to postgres 'postgres://testuser:pass@localhost/test' dir1 + $ qsv to postgres 'postgres://testuser:pass@localhost/test' dir1 + +Load files listed in the 'input.infile-list' to a local database 'test' with user `testuser`, password `pass`. 
+ + $ qsv to postgres 'postgres://testuser:pass@localhost/test' input.infile-list Drop tables if they exist before loading. @@ -61,6 +65,10 @@ Load all files in dir1 to sqlite database `test.db` $ qsv to sqlite test.db dir +Load files listed in the 'mydata.infile-list' to sqlite database `test.db` + + $ qsv to sqlite test.db mydata.infile-list + Drop tables if they exist before loading. $ qsv to sqlite test.db --drop file1.csv file2.csv @@ -90,6 +98,14 @@ filename without the extension. Note the `output.xlsx` will be overwritten if it $ qsv to xlsx output.xlsx file1.csv file2.csv +Load all files in dir1 into xlsx file. + + $ qsv to xlsx output.xlsx dir1 + +Load files listed in the 'ourdata.infile-list' into xlsx file. + + $ qsv to xlsx output.xlsx ourdata.infile-list + PARQUET (only available if compiled with `to_parquet` feature) Convert to directory of parquet files. Need to select a directory, it will be created if it does not exists. If the `to_parquet` feature is not enabled, a simpler parquet conversion is available using the `sqlp` @@ -108,6 +124,14 @@ Convert from stdin. $ qsv to parquet --pipe mydir - +Convert all files in dir1 into parquet files in myparquetdir. + + $ qsv to parquet myparquetdir dir1 + +Convert files listed in the 'data.infile-list' into parquet files in myparquetdir. + + $ qsv to parquet myparquetdir data.infile-list + DATAPACKAGE Generate a datapackage, which contains stats and information about what is in the CSV files. @@ -125,6 +149,10 @@ Generate a `datapackage.json` file from all the files in dir1 $ qsv to datapackage datapackage.json dir1 +Generate a `datapackage.json` file from all the files listed in the 'data.infile-list' + + $ qsv to datapackage datapackage.json data.infile-list + For all other conversions you can output the datapackage created by specifying `--print-package`. 
$ qsv to xlsx datapackage.xlsx --stats --print-package file1.csv file2.csv From 602a93a0888a451a66b14e6190f0174d556e347e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 25 Dec 2023 20:17:24 -0500 Subject: [PATCH 3/7] `sqlp`: document `.infile-list` support --- src/cmd/sqlp.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index e2da084ea..583cf652b 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -100,8 +100,9 @@ Usage: sqlp arguments: input The CSV file/s to query. Use '-' for standard input. - If input is a directory, all CSV files in the directory will - be used. + If input is a directory, all files in the directory will be read as input. + If the input is a file with a '.infile-list' extension, the + file will be read as a list of files to use as input. If the input are snappy compressed file(s), it will be decompressed automatically. Column headers are required. Use 'qsv rename _all_generic --no-headers' From 93d82fc26e7ea088ee6ebd490b531f73f1669e57 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Dec 2023 00:42:35 -0500 Subject: [PATCH 4/7] refactor many_configs() helper - to accept &[PathBuf] instead of &[String] - refactor process_input helper to make custom_empty_stdin_errmsg parm have a default value, to reduce unnecessary literals using process_input --- src/util.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/util.rs b/src/util.rs index 969f5fcba..310d6a6e3 100644 --- a/src/util.rs +++ b/src/util.rs @@ -406,12 +406,15 @@ where #[inline] pub fn many_configs( - inps: &[String], + inps: &[PathBuf], delim: Option<Delimiter>, no_headers: bool, flexible: bool, ) -> Result<Vec<Config>, String> { - let mut inps = inps.to_vec(); + let mut inps = inps + .iter() + .map(|p| p.to_str().unwrap_or("-").to_owned()) + .collect::<Vec<_>>(); if 
inps.is_empty() { inps.push("-".to_owned()); // stdin } @@ -1447,7 +1450,7 @@ pub fn isutf8_file(path: &Path) -> Result<bool, CliError> { pub fn process_input( mut arg_input: Vec<PathBuf>, tmpdir: &tempfile::TempDir, - empty_stdin_errmsg: &str, + custom_empty_stdin_errmsg: &str, ) -> Result<Vec<PathBuf>, CliError> { let mut processed_input = Vec::with_capacity(arg_input.len()); @@ -1528,7 +1531,12 @@ pub fn process_input( } if processed_input.is_empty() { - return fail_clierror!("{empty_stdin_errmsg}"); + if custom_empty_stdin_errmsg.is_empty() { + return fail_clierror!( + "No data on stdin. Please provide at least one input file or pipe data to stdin." + ); + } + return fail_clierror!("{custom_empty_stdin_errmsg}"); } log::debug!("processed input: {:?}", processed_input); Ok(processed_input) From 933ddcd05e753fb8bd7c8e520fdacc9a0652b623 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Dec 2023 00:46:36 -0500 Subject: [PATCH 5/7] `describegpt`, `sqlp`, `to`, `tojsonl`: adapt to use new process_input with default custom_stdin_errmsg --- src/cmd/describegpt.rs | 2 +- src/cmd/sqlp.rs | 6 +----- src/cmd/to.rs | 37 ++++++++----------------------------- src/cmd/tojsonl.rs | 2 +- 4 files changed, 11 insertions(+), 36 deletions(-) diff --git a/src/cmd/describegpt.rs b/src/cmd/describegpt.rs index 95a3346f0..d2b04b3f7 100644 --- a/src/cmd/describegpt.rs +++ b/src/cmd/describegpt.rs @@ -572,7 +572,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.arg_input.clone().unwrap_or_else(|| "-".to_string()), )], &tmpdir, - "No data on stdin. 
Please provide at least one input file or pipe data to stdin.", + "", )?; // safety: we just checked that there is at least one input file let input_path = work_input[0] diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index 583cf652b..9981fcb22 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -376,11 +376,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut args: Args = util::get_args(USAGE, argv)?; let tmpdir = tempfile::tempdir()?; - args.arg_input = process_input( - args.arg_input, - &tmpdir, - "No data on stdin. Please provide at least one input file or pipe data to stdin.", - )?; + args.arg_input = process_input(args.arg_input, &tmpdir, "")?; let rnull_values = if args.flag_rnull_values == "<empty string>" { vec![String::new()] diff --git a/src/cmd/to.rs b/src/cmd/to.rs index 7aa9bdcb3..aa9fddeca 100644 --- a/src/cmd/to.rs +++ b/src/cmd/to.rs @@ -246,6 +246,9 @@ impl From<csvs_convert::DescribeError> for CliError { } } +static EMPTY_STDIN_ERRMSG: &str = + "No data on stdin. Need to add connection string as first argument then the input CSVs"; + pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; debug!("'to' command running"); @@ -267,11 +270,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { if args.cmd_postgres { debug!("converting to postgres"); - arg_input = process_input( - arg_input, - &tmpdir, - "No data on stdin. Need to add connection string as first argument then the input CSVs", - )?; + arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?; if args.flag_dump { options.dump_file = args.arg_postgres.expect("checked above"); output = csvs_to_postgres_with_options(String::new(), arg_input, options)?; @@ -285,12 +284,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { debug!("conversion to postgres complete"); } else if args.cmd_sqlite { debug!("converting to sqlite"); - arg_input = process_input( - arg_input, - &tmpdir, - "No data on stdin. 
Need to add the name of a sqlite db as first argument then the \ - input CSVs", - )?; + arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?; if args.flag_dump { options.dump_file = args.arg_sqlite.expect("checked above"); output = csvs_to_sqlite_with_options(String::new(), arg_input, options)?; @@ -306,12 +300,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { #[cfg(all(feature = "to_parquet", feature = "feature_capable"))] { debug!("converting to parquet"); - arg_input = process_input( - arg_input, - &tmpdir, - "No data on stdin. Need to add the name of a parquet directory as first argument \ - then the input CSVs", - )?; + arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?; output = csvs_to_parquet_with_options( args.arg_parquet.expect("checked above"), arg_input, @@ -327,24 +316,14 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } } else if args.cmd_xlsx { debug!("converting to xlsx"); - arg_input = process_input( - arg_input, - &tmpdir, - "No data on stdin. Need to add the name of an xlsx file as first argument then the \ - input CSVs", - )?; + arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?; output = csvs_to_xlsx_with_options(args.arg_xlsx.expect("checked above"), arg_input, options)?; debug!("conversion to xlsx complete"); } else if args.cmd_datapackage { debug!("creating datapackage"); - arg_input = process_input( - arg_input, - &tmpdir, - "No data on stdin. Need to add the name of a datapackage file as first argument then \ - the input CSVs", - )?; + arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?; let describe_options = DescribeOptions::builder() .delimiter(options.delimiter) diff --git a/src/cmd/tojsonl.rs b/src/cmd/tojsonl.rs index 9e0efd7fa..16a0bcadf 100644 --- a/src/cmd/tojsonl.rs +++ b/src/cmd/tojsonl.rs @@ -87,7 +87,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.arg_input.clone().unwrap_or_else(|| "-".to_string()), )], &tmpdir, - "No data on stdin. 
Please provide at least one input file or pipe data to stdin.", + "", )?; // safety: there's at least one valid element in work_input From 54977d80ba0c1e7fdda71b7a057b83d2529cf949 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Dec 2023 00:50:58 -0500 Subject: [PATCH 6/7] `cat`: now uses process_input so it can process dir input or `.infile-list` file input --- src/cmd/cat.rs | 18 +++++++- tests/test_cat.rs | 104 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 2 deletions(-) diff --git a/src/cmd/cat.rs b/src/cmd/cat.rs index 315457abb..af4e30c9c 100644 --- a/src/cmd/cat.rs +++ b/src/cmd/cat.rs @@ -29,6 +29,15 @@ Usage: qsv cat columns [options] [<input>...] qsv cat --help +cat arguments: + <input>... The CSV file(s) to read. Use '-' for standard input. + If input is a directory, all files in the directory will + be read as input. + If the input is a file with a '.infile-list' extension, + the file will be read as a list of input files. + If the input are snappy-compressed file(s), it will be + decompressed automatically. + cat options: COLUMNS OPTION: -p, --pad When concatenating columns, this flag will cause @@ -57,6 +66,8 @@ Common options: Must be a single character. 
(default: ,) "#; +use std::path::PathBuf; + use indexmap::{IndexMap, IndexSet}; use serde::Deserialize; use tempfile; @@ -73,7 +84,7 @@ struct Args { cmd_columns: bool, flag_group: bool, flag_group_name: String, - arg_input: Vec<String>, + arg_input: Vec<PathBuf>, flag_pad: bool, flag_flexible: bool, flag_output: Option<String>, @@ -82,7 +93,10 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = util::get_args(USAGE, argv)?; + let mut args: Args = util::get_args(USAGE, argv)?; + + let tmpdir = tempfile::tempdir()?; + args.arg_input = util::process_input(args.arg_input, &tmpdir, "")?; if args.cmd_rows { args.cat_rows() } else if args.cmd_rowskey { diff --git a/tests/test_cat.rs b/tests/test_cat.rs index a705625a5..eb605ea05 100644 --- a/tests/test_cat.rs +++ b/tests/test_cat.rs @@ -171,6 +171,58 @@ fn cat_rows_flexible() { assert_eq!(got, expected); } +#[test] +fn cat_rows_flexible_infile() { + let wrk = Workdir::new("cat_rows_flexible_infile"); + wrk.create( + "in1.csv", + vec![ + svec!["a", "b", "c"], + svec!["1", "2", "3"], + svec!["2", "3", "4"], + ], + ); + + wrk.create( + "in2.csv", + vec![ + svec!["a", "b", "c"], + svec!["3", "1", "2"], + svec!["4", "2", "3"], + ], + ); + + wrk.create( + "in3.csv", + vec![ + svec!["a", "b", "c", "d"], + svec!["1", "2", "4", "3"], + svec!["2", "3", "5", "4"], + svec!["z", "y", "w", "x"], + ], + ); + + wrk.create_from_string("testdata.infile-list", "in1.csv\nin2.csv\nin3.csv\n"); + + let mut cmd = wrk.command("cat"); + cmd.arg("rows") + .arg("--flexible") + .arg("testdata.infile-list"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["a", "b", "c"], + svec!["1", "2", "3"], + svec!["2", "3", "4"], + svec!["3", "1", "2"], + svec!["4", "2", "3"], + svec!["1", "2", "4", "3"], + svec!["2", "3", "5", "4"], + svec!["z", "y", "w", "x"], + ]; + assert_eq!(got, expected); +} + #[test] fn cat_rowskey_grouping() { let wrk = Workdir::new("cat_rowskey_grouping"); @@ 
-223,6 +275,58 @@ fn cat_rowskey_grouping() { assert_eq!(got, expected); } +#[test] +fn cat_rowskey_grouping_infile() { + let wrk = Workdir::new("cat_rowskey_grouping_infile"); + wrk.create( + "in1.csv", + vec![ + svec!["a", "b", "c"], + svec!["1", "2", "3"], + svec!["2", "3", "4"], + ], + ); + + wrk.create( + "in2.csv", + vec![ + svec!["c", "a", "b"], + svec!["3", "1", "2"], + svec!["4", "2", "3"], + ], + ); + + wrk.create( + "in3.csv", + vec![ + svec!["a", "b", "d", "c"], + svec!["1", "2", "4", "3"], + svec!["2", "3", "5", "4"], + svec!["z", "y", "w", "x"], + ], + ); + + wrk.create_from_string("testdata.infile-list", "in1.csv\nin2.csv\nin3.csv\n"); + + let mut cmd = wrk.command("cat"); + cmd.arg("rowskey") + .arg("--group") + .arg("testdata.infile-list"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["file", "a", "b", "c", "d"], + svec!["in1", "1", "2", "3", ""], + svec!["in1", "2", "3", "4", ""], + svec!["in2", "1", "2", "3", ""], + svec!["in2", "2", "3", "4", ""], + svec!["in3", "1", "2", "3", "4"], + svec!["in3", "2", "3", "4", "5"], + svec!["in3", "z", "y", "x", "w"], + ]; + assert_eq!(got, expected); +} + #[test] fn cat_rowskey_grouping_customname() { let wrk = Workdir::new("cat_rowskey_grouping_customname"); From fc56590a27fa7d394c303e3c7ba699a4e2f6a2a5 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 26 Dec 2023 00:55:05 -0500 Subject: [PATCH 7/7] `headers`: now also accepts process_input - dir input; `infile-list`; multiple input; auto decompress snappy --- src/cmd/headers.rs | 17 ++++++++++--- tests/test_headers.rs | 55 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/cmd/headers.rs b/src/cmd/headers.rs index 65e4ea9c0..647da2fa6 100644 --- a/src/cmd/headers.rs +++ b/src/cmd/headers.rs @@ -13,6 +13,15 @@ Usage: qsv headers [options] [<input>...] qsv headers --help +headers arguments: + <input>... 
The CSV file(s) to read. Use '-' for standard input. + If input is a directory, all files in the directory will + be read as input. + If the input is a file with a '.infile-list' extension, + the file will be read as a list of input files. + If the input are snappy-compressed file(s), it will be + decompressed automatically. + headers options: -j, --just-names Only show the header names (hide column index). This is automatically enabled if more than one @@ -27,7 +36,7 @@ Common options: Must be a single character. (default: ,) "#; -use std::io; +use std::{io, path::PathBuf}; use serde::Deserialize; use tabwriter::TabWriter; @@ -36,7 +45,7 @@ use crate::{config::Delimiter, util, CliResult}; #[derive(Deserialize)] struct Args { - arg_input: Vec<String>, + arg_input: Vec<PathBuf>, flag_just_names: bool, flag_intersect: bool, flag_trim: bool, @@ -44,7 +53,9 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = util::get_args(USAGE, argv)?; + let mut args: Args = util::get_args(USAGE, argv)?; + let tmpdir = tempfile::tempdir()?; + args.arg_input = util::process_input(args.arg_input, &tmpdir, "")?; let configs = util::many_configs(&args.arg_input, args.flag_delimiter, true, false)?; let num_inputs = configs.len(); diff --git a/tests/test_headers.rs b/tests/test_headers.rs index ffc974d60..62dd36f7b 100644 --- a/tests/test_headers.rs +++ b/tests/test_headers.rs @@ -171,3 +171,58 @@ h2 h3"; assert_eq!(got, expected.to_string()); } + +#[test] +fn headers_infile() { + let wrk = Workdir::new("headers_infile"); + wrk.create("in1.csv", vec![svec!["a", "b", "c"], svec!["1", "2", "3"]]); + + wrk.create("in2.csv", vec![svec!["c", "d", "e"], svec!["3", "1", "2"]]); + + wrk.create( + "in3.csv", + vec![svec!["a", "b", "f", "g"], svec!["1", "2", "4", "3"]], + ); + + wrk.create_from_string("testdata.infile-list", "in1.csv\nin2.csv\nin3.csv\n"); + + let mut cmd = wrk.command("headers"); + cmd.arg("testdata.infile-list"); + + let got: Vec<Vec<String>> = 
wrk.read_stdout(&mut cmd); + let expected = vec![ + ["a"], + ["b"], + ["c"], + ["c"], + ["d"], + ["e"], + ["a"], + ["b"], + ["f"], + ["g"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn headers_intersect_infile() { + let wrk = Workdir::new("headers_intersect_infile"); + wrk.create("in1.csv", vec![svec!["a", "b", "c"], svec!["1", "2", "3"]]); + + wrk.create("in2.csv", vec![svec!["c", "d", "e"], svec!["3", "1", "2"]]); + + wrk.create( + "in3.csv", + vec![svec!["a", "b", "f", "g"], svec!["1", "2", "4", "3"]], + ); + + wrk.create_from_string("testdata.infile-list", "in1.csv\nin2.csv\nin3.csv\n"); + + let mut cmd = wrk.command("headers"); + cmd.arg("--intersect").arg("testdata.infile-list"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![["a"], ["b"], ["c"], ["d"], ["e"], ["f"], ["g"]]; + assert_eq!(got, expected); +}