diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 89660657a7..2a6845a8c5 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -316,6 +316,9 @@ MILLER(1) MILLER(1) fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. + --csv-trim-leading-space Trims leading spaces in CSV data. Use this for data + like '"foo", "bar"' which is non-RFC-4180 compliant, + but common. --headerless-csv-output or --ho or --headerless-tsv-output Print only CSV/TSV data lines; do not print CSV/TSV header lines. @@ -3354,5 +3357,5 @@ MILLER(1) MILLER(1) - 2023-04-16 MILLER(1) + 2023-04-20 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 3469324360..77149c34b2 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -295,6 +295,9 @@ MILLER(1) MILLER(1) fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. + --csv-trim-leading-space Trims leading spaces in CSV data. Use this for data + like '"foo", "bar"' which is non-RFC-4180 compliant, + but common. --headerless-csv-output or --ho or --headerless-tsv-output Print only CSV/TSV data lines; do not print CSV/TSV header lines. @@ -3333,4 +3336,4 @@ MILLER(1) MILLER(1) - 2023-04-16 MILLER(1) + 2023-04-20 MILLER(1) diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index e3bda0ef57..24e2cbc388 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -117,6 +117,7 @@ These are flags which are applicable to CSV format. **Flags:** * `--allow-ragged-csv-input or --ragged or --allow-ragged-tsv-input`: If a data line has fewer fields than the header line, fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. 
+* `--csv-trim-leading-space`: Trims leading spaces in CSV data. Use this for data like '"foo", "bar"' which is non-RFC-4180 compliant, but common. * `--headerless-csv-output or --ho or --headerless-tsv-output`: Print only CSV/TSV data lines; do not print CSV/TSV header lines. * `--implicit-csv-header or --headerless-csv-input or --hi or --implicit-tsv-header`: Use 1,2,3,... as field labels, rather than from line 1 of input files. Tip: combine with `label` to recreate missing headers. * `--lazy-quotes`: Accepts quotes appearing in unquoted fields, and non-doubled quotes appearing in quoted fields. diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go index 254abdbfb4..cb01c27410 100644 --- a/internal/pkg/cli/option_parse.go +++ b/internal/pkg/cli/option_parse.go @@ -2173,6 +2173,15 @@ var CSVTSVOnlyFlagSection = FlagSection{ }, }, + { + name: "--csv-trim-leading-space", + help: `Trims leading spaces in CSV data. Use this for data like '"foo", "bar"' which is non-RFC-4180 compliant, but common.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.CSVTrimLeadingSpace = true + *pargi += 1 + }, + }, + { name: "--quote-all", help: "Force double-quoting of CSV fields.", diff --git a/internal/pkg/cli/option_types.go b/internal/pkg/cli/option_types.go index 63013ec306..06f71ad278 100644 --- a/internal/pkg/cli/option_types.go +++ b/internal/pkg/cli/option_types.go @@ -56,6 +56,7 @@ type TReaderOptions struct { UseImplicitCSVHeader bool AllowRaggedCSVInput bool CSVLazyQuotes bool + CSVTrimLeadingSpace bool CommentHandling TCommentHandling CommentString string diff --git a/internal/pkg/input/record_reader_csv.go b/internal/pkg/input/record_reader_csv.go index 8829d2a6aa..fc1da8dc73 100644 --- a/internal/pkg/input/record_reader_csv.go +++ b/internal/pkg/input/record_reader_csv.go @@ -18,10 +18,11 @@ import ( // ---------------------------------------------------------------- type RecordReaderCSV struct { - 
readerOptions *cli.TReaderOptions - recordsPerBatch int64 // distinct from readerOptions.RecordsPerBatch for join/repl - ifs0 byte // Go's CSV library only lets its 'Comma' be a single character - csvLazyQuotes bool // Maps directly to Go's CSV library's LazyQuotes + readerOptions *cli.TReaderOptions + recordsPerBatch int64 // distinct from readerOptions.RecordsPerBatch for join/repl + ifs0 byte // Go's CSV library only lets its 'Comma' be a single character + csvLazyQuotes bool // Maps directly to Go's CSV library's LazyQuotes + csvTrimLeadingSpace bool // Maps directly to Go's CSV library's TrimLeadingSpace filename string rowNumber int64 @@ -40,10 +41,11 @@ func NewRecordReaderCSV( return nil, fmt.Errorf("for CSV, IFS can only be a single character") } return &RecordReaderCSV{ - readerOptions: readerOptions, - ifs0: readerOptions.IFS[0], - recordsPerBatch: recordsPerBatch, - csvLazyQuotes: readerOptions.CSVLazyQuotes, + readerOptions: readerOptions, + ifs0: readerOptions.IFS[0], + recordsPerBatch: recordsPerBatch, + csvLazyQuotes: readerOptions.CSVLazyQuotes, + csvTrimLeadingSpace: readerOptions.CSVTrimLeadingSpace, }, nil } @@ -105,6 +107,7 @@ func (reader *RecordReaderCSV) processHandle( csvReader := csv.NewReader(NewBOMStrippingReader(handle)) csvReader.Comma = rune(reader.ifs0) csvReader.LazyQuotes = reader.csvLazyQuotes + csvReader.TrimLeadingSpace = reader.csvTrimLeadingSpace csvRecordsChannel := make(chan *list.List, recordsPerBatch) go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel, recordsPerBatch) diff --git a/man/manpage.txt b/man/manpage.txt index 3469324360..77149c34b2 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -295,6 +295,9 @@ MILLER(1) MILLER(1) fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. + --csv-trim-leading-space Trims leading spaces in CSV data. 
Use this for data + like '"foo", "bar"' which is non-RFC-4180 compliant, + but common. --headerless-csv-output or --ho or --headerless-tsv-output Print only CSV/TSV data lines; do not print CSV/TSV header lines. @@ -3333,4 +3336,4 @@ MILLER(1) MILLER(1) - 2023-04-16 MILLER(1) + 2023-04-20 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index 8fd2772df9..b3545e5206 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2023-04-16 +.\" Date: 2023-04-20 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2023-04-16" "\ \&" "\ \&" +.TH "MILLER" "1" "2023-04-20" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -366,6 +366,9 @@ These are flags which are applicable to CSV format. fill remaining keys with empty string. If a data line has more fields than the header line, use integer field labels as in the implicit-header case. +--csv-trim-leading-space Trims leading spaces in CSV data. Use this for data + like '"foo", "bar"' which is non-RFC-4180 compliant, + but common. --headerless-csv-output or --ho or --headerless-tsv-output Print only CSV/TSV data lines; do not print CSV/TSV header lines. diff --git a/test/cases/help/0018/expout b/test/cases/help/0018/expout index 55b7d1da78..4616c14568 100644 --- a/test/cases/help/0018/expout +++ b/test/cases/help/0018/expout @@ -1,3 +1,5 @@ +--csv-trim-leading-space +Trims leading spaces in CSV data. Use this for data like '"foo", "bar"' which is non-RFC-4180 compliant, but common. --csv Use CSV format for input and output data. --csvlite