Skip to content

Commit

Permalink
Merge pull request #1496 from jqnatividad/1293-infile-list-support
Browse files Browse the repository at this point in the history
refactor commands that accept multiple input files to use improved process_input helper
  • Loading branch information
jqnatividad authored Dec 26, 2023
2 parents fc2e5b0 + fc56590 commit c162bd4
Show file tree
Hide file tree
Showing 9 changed files with 276 additions and 54 deletions.
18 changes: 16 additions & 2 deletions src/cmd/cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ Usage:
qsv cat columns [options] [<input>...]
qsv cat --help
cat arguments:
<input>... The CSV file(s) to read. Use '-' for standard input.
If input is a directory, all files in the directory will
be read as input.
If the input is a file with a '.infile-list' extension,
the file will be read as a list of input files.
If the inputs are snappy-compressed files, they will be
decompressed automatically.
cat options:
COLUMNS OPTION:
-p, --pad When concatenating columns, this flag will cause
Expand Down Expand Up @@ -57,6 +66,8 @@ Common options:
Must be a single character. (default: ,)
"#;

use std::path::PathBuf;

use indexmap::{IndexMap, IndexSet};
use serde::Deserialize;
use tempfile;
Expand All @@ -73,7 +84,7 @@ struct Args {
cmd_columns: bool,
flag_group: bool,
flag_group_name: String,
arg_input: Vec<String>,
arg_input: Vec<PathBuf>,
flag_pad: bool,
flag_flexible: bool,
flag_output: Option<String>,
Expand All @@ -82,7 +93,10 @@ struct Args {
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut args: Args = util::get_args(USAGE, argv)?;

let tmpdir = tempfile::tempdir()?;
args.arg_input = util::process_input(args.arg_input, &tmpdir, "")?;
if args.cmd_rows {
args.cat_rows()
} else if args.cmd_rowskey {
Expand Down
2 changes: 1 addition & 1 deletion src/cmd/describegpt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.arg_input.clone().unwrap_or_else(|| "-".to_string()),
)],
&tmpdir,
"No data on stdin. Please provide at least one input file or pipe data to stdin.",
"",
)?;
// safety: we just checked that there is at least one input file
let input_path = work_input[0]
Expand Down
17 changes: 14 additions & 3 deletions src/cmd/headers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ Usage:
qsv headers [options] [<input>...]
qsv headers --help
headers arguments:
<input>... The CSV file(s) to read. Use '-' for standard input.
If input is a directory, all files in the directory will
be read as input.
If the input is a file with a '.infile-list' extension,
the file will be read as a list of input files.
If the inputs are snappy-compressed files, they will be
decompressed automatically.
headers options:
-j, --just-names Only show the header names (hide column index).
This is automatically enabled if more than one
Expand All @@ -27,7 +36,7 @@ Common options:
Must be a single character. (default: ,)
"#;

use std::io;
use std::{io, path::PathBuf};

use serde::Deserialize;
use tabwriter::TabWriter;
Expand All @@ -36,15 +45,17 @@ use crate::{config::Delimiter, util, CliResult};

#[derive(Deserialize)]
struct Args {
arg_input: Vec<String>,
arg_input: Vec<PathBuf>,
flag_just_names: bool,
flag_intersect: bool,
flag_trim: bool,
flag_delimiter: Option<Delimiter>,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut args: Args = util::get_args(USAGE, argv)?;
let tmpdir = tempfile::tempdir()?;
args.arg_input = util::process_input(args.arg_input, &tmpdir, "")?;
let configs = util::many_configs(&args.arg_input, args.flag_delimiter, true, false)?;

let num_inputs = configs.len();
Expand Down
11 changes: 4 additions & 7 deletions src/cmd/sqlp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ Usage:
sqlp arguments:
input The CSV file/s to query. Use '-' for standard input.
If input is a directory, all CSV files in the directory will
be used.
If input is a directory, all files in the directory will be read as input.
If the input is a file with a '.infile-list' extension, the
file will be read as a list of files to use as input.
If the inputs are snappy-compressed files, they will be
decompressed automatically.
Column headers are required. Use 'qsv rename _all_generic --no-headers'
Expand Down Expand Up @@ -375,11 +376,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut args: Args = util::get_args(USAGE, argv)?;

let tmpdir = tempfile::tempdir()?;
args.arg_input = process_input(
args.arg_input,
&tmpdir,
"No data on stdin. Please provide at least one input file or pipe data to stdin.",
)?;
args.arg_input = process_input(args.arg_input, &tmpdir, "")?;

let rnull_values = if args.flag_rnull_values == "<empty string>" {
vec![String::new()]
Expand Down
67 changes: 37 additions & 30 deletions src/cmd/to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ Load same files into a new/existing postgres database whose connection string is
Load files inside a directory to a local database 'test' with user `testuser`, password `pass`.
$ qsv to postgres 'postgres://testuser:pass@localhost/test' dir1
$ qsv to postgres 'postgres://testuser:pass@localhost/test' dir1
Load files listed in the 'input.infile-list' to a local database 'test' with user `testuser`, password `pass`.
$ qsv to postgres 'postgres://testuser:pass@localhost/test' input.infile-list
Drop tables if they exist before loading.
Expand Down Expand Up @@ -61,6 +65,10 @@ Load all files in dir1 to sqlite database `test.db`
$ qsv to sqlite test.db dir
Load files listed in the 'mydata.infile-list' to sqlite database `test.db`
$ qsv to sqlite test.db mydata.infile-list
Drop tables if they exist before loading.
$ qsv to sqlite test.db --drop file1.csv file2.csv
Expand Down Expand Up @@ -90,6 +98,14 @@ filename without the extension. Note the `output.xlsx` will be overwritten if it
$ qsv to xlsx output.xlsx file1.csv file2.csv
Load all files in dir1 into xlsx file.
$ qsv to xlsx output.xlsx dir1
Load files listed in the 'ourdata.infile-list' into xlsx file.
$ qsv to xlsx output.xlsx ourdata.infile-list
PARQUET (only available if compiled with `to_parquet` feature)
Convert to a directory of parquet files. You need to specify a directory; it will be created if it does not exist.
If the `to_parquet` feature is not enabled, a simpler parquet conversion is available using the `sqlp`
Expand All @@ -108,6 +124,14 @@ Convert from stdin.
$ qsv to parquet --pipe mydir -
Convert all files in dir1 into parquet files in myparquetdir.
$ qsv to parquet myparquetdir dir1
Convert files listed in the 'data.infile-list' into parquet files in myparquetdir.
$ qsv to parquet myparquetdir data.infile-list
DATAPACKAGE
Generate a datapackage, which contains stats and information about what is in the CSV files.
Expand All @@ -125,6 +149,10 @@ Generate a `datapackage.json` file from all the files in dir1
$ qsv to datapackage datapackage.json dir1
Generate a `datapackage.json` file from all the files listed in the 'data.infile-list'
$ qsv to datapackage datapackage.json data.infile-list
For all other conversions you can output the datapackage created by specifying `--print-package`.
$ qsv to xlsx datapackage.xlsx --stats --print-package file1.csv file2.csv
Expand Down Expand Up @@ -218,6 +246,9 @@ impl From<csvs_convert::DescribeError> for CliError {
}
}

/// Error message passed to `process_input` by every `to` subcommand
/// (postgres, sqlite, parquet, xlsx and datapackage) for the case where
/// no usable input was found.
///
/// The wording is deliberately target-neutral: the first positional argument
/// is a connection string for postgres, but a database file, directory or
/// output file for the other subcommands.
static EMPTY_STDIN_ERRMSG: &str = "No data on stdin. Need to add the output target (connection \
                                   string, database, directory or file) as first argument then \
                                   the input CSVs";

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
debug!("'to' command running");
Expand All @@ -239,11 +270,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {

if args.cmd_postgres {
debug!("converting to postgres");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add connection string as first argument then the input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;
if args.flag_dump {
options.dump_file = args.arg_postgres.expect("checked above");
output = csvs_to_postgres_with_options(String::new(), arg_input, options)?;
Expand All @@ -257,12 +284,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
debug!("conversion to postgres complete");
} else if args.cmd_sqlite {
debug!("converting to sqlite");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of a sqlite db as first argument then the \
input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;
if args.flag_dump {
options.dump_file = args.arg_sqlite.expect("checked above");
output = csvs_to_sqlite_with_options(String::new(), arg_input, options)?;
Expand All @@ -278,12 +300,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
#[cfg(all(feature = "to_parquet", feature = "feature_capable"))]
{
debug!("converting to parquet");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of a parquet directory as first argument \
then the input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;
output = csvs_to_parquet_with_options(
args.arg_parquet.expect("checked above"),
arg_input,
Expand All @@ -299,24 +316,14 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
}
} else if args.cmd_xlsx {
debug!("converting to xlsx");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of an xlsx file as first argument then the \
input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;

output =
csvs_to_xlsx_with_options(args.arg_xlsx.expect("checked above"), arg_input, options)?;
debug!("conversion to xlsx complete");
} else if args.cmd_datapackage {
debug!("creating datapackage");
arg_input = process_input(
arg_input,
&tmpdir,
"No data on stdin. Need to add the name of a datapackage file as first argument then \
the input CSVs",
)?;
arg_input = process_input(arg_input, &tmpdir, EMPTY_STDIN_ERRMSG)?;

let describe_options = DescribeOptions::builder()
.delimiter(options.delimiter)
Expand Down
2 changes: 1 addition & 1 deletion src/cmd/tojsonl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.arg_input.clone().unwrap_or_else(|| "-".to_string()),
)],
&tmpdir,
"No data on stdin. Please provide at least one input file or pipe data to stdin.",
"",
)?;

// safety: there's at least one valid element in work_input
Expand Down
54 changes: 44 additions & 10 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,12 +406,15 @@ where

#[inline]
pub fn many_configs(
inps: &[String],
inps: &[PathBuf],
delim: Option<Delimiter>,
no_headers: bool,
flexible: bool,
) -> Result<Vec<Config>, String> {
let mut inps = inps.to_vec();
let mut inps = inps
.iter()
.map(|p| p.to_str().unwrap_or("-").to_owned())
.collect::<Vec<_>>();
if inps.is_empty() {
inps.push("-".to_owned()); // stdin
}
Expand Down Expand Up @@ -1441,12 +1444,13 @@ pub fn isutf8_file(path: &Path) -> Result<bool, CliError> {
/// If it's not empty, check the input files if they exist, and return an error if they don't
///
/// If the input is a directory, add all the files in the directory to the input
/// If the input is a file, add the file to the input
/// If the input is a file with the extension ".infile-list", read the file and add each line
/// as a file to the input
/// If the input is any other file, add the file to the input
/// If the input are snappy compressed files, uncompress them before adding them to the input
pub fn process_input(
mut arg_input: Vec<PathBuf>,
tmpdir: &tempfile::TempDir,
empty_stdin_errmsg: &str,
custom_empty_stdin_errmsg: &str,
) -> Result<Vec<PathBuf>, CliError> {
let mut processed_input = Vec::with_capacity(arg_input.len());

Expand All @@ -1464,11 +1468,36 @@ pub fn process_input(
arg_input.remove(0);
}

let work_input = if arg_input.len() == 1 && arg_input[0].is_dir() {
// if the input is a directory, add all the files in the directory to the input
std::fs::read_dir(&arg_input[0])?
.map(|entry| entry.map(|e| e.path()))
.collect::<Result<Vec<_>, _>>()?
let work_input = if arg_input.len() == 1 {
let input_path = &arg_input[0];
if input_path.is_dir() {
// if the input is a directory, add all the files in the directory to the input
std::fs::read_dir(input_path)?
.map(|entry| entry.map(|e| e.path()))
.collect::<Result<Vec<_>, _>>()?
} else if input_path.is_file() {
// if the input is a file and has the extension "infile-list" case-insensitive,
// read the file. Each line is a file path
if input_path
.extension()
.and_then(std::ffi::OsStr::to_str)
.map(str::to_lowercase)
== Some("infile-list".to_string())
{
let mut input_file = std::fs::File::open(input_path)?;
let mut input_file_contents = String::new();
input_file.read_to_string(&mut input_file_contents)?;
input_file_contents
.lines()
.map(PathBuf::from)
.collect::<Vec<_>>()
} else {
// if the input is not an ".infile-list" file, add the file to the input
arg_input
}
} else {
arg_input
}
} else {
arg_input
};
Expand Down Expand Up @@ -1502,7 +1531,12 @@ pub fn process_input(
}

if processed_input.is_empty() {
return fail_clierror!("{empty_stdin_errmsg}");
if custom_empty_stdin_errmsg.is_empty() {
return fail_clierror!(
"No data on stdin. Please provide at least one input file or pipe data to stdin."
);
}
return fail_clierror!("{custom_empty_stdin_errmsg}");
}
log::debug!("processed input: {:?}", processed_input);
Ok(processed_input)
Expand Down
Loading

0 comments on commit c162bd4

Please sign in to comment.