Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly determine CSV delimiter #17459

Merged
merged 32 commits into from
Oct 30, 2021
Merged
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
fc8dd28
Fixes #16558 CSV delimiter determiner
richmahn Oct 25, 2021
6fe4f9f
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 25, 2021
c99d94e
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 26, 2021
58e98c5
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 27, 2021
56f0995
Fixes #16558 - properly determine CSV delmiiter
richmahn Oct 27, 2021
7da56ba
Moves quoteString to a new function
richmahn Oct 27, 2021
5b3a8d9
Adds big test with lots of commas for tab delimited csv
richmahn Oct 28, 2021
b0a6290
Adds comments
richmahn Oct 28, 2021
79ee659
Shortens the text of the test
richmahn Oct 28, 2021
f508559
Removes single quotes from regexp as only double quotes need to be se…
richmahn Oct 28, 2021
e0f2862
Fixes spelling
richmahn Oct 28, 2021
ab0583c
Merge branch 'main' into richmahn-16558-guess-delimiter
wxiaoguang Oct 28, 2021
e466ab7
Fixes check of length as it probalby will only be 1e4, not greater
richmahn Oct 28, 2021
a54e6ac
Merge branch 'richmahn-16558-guess-delimiter' of github.com:richmahn/…
richmahn Oct 28, 2021
62d10e3
Makes sample size a const, properly removes truncated line
richmahn Oct 28, 2021
0a2f06d
Makes sample size a const, properly removes truncated line
richmahn Oct 28, 2021
85eda5f
Fixes comment
richmahn Oct 28, 2021
fc02c05
Fixes comment
richmahn Oct 28, 2021
baf37d3
tests for FormatError() function
richmahn Oct 28, 2021
32a9c36
Adds logic to find the limiter before or after a quoted value
richmahn Oct 28, 2021
e8ac38a
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 28, 2021
feabe0e
Simplifies regex
richmahn Oct 28, 2021
f808d73
Merge branch 'main' into richmahn-16558-guess-delimiter
zeripath Oct 29, 2021
0084114
Error tests
richmahn Oct 29, 2021
967da1d
Merge branch 'richmahn-16558-guess-delimiter' of github.com:richmahn/…
richmahn Oct 29, 2021
683def1
Error tests
richmahn Oct 29, 2021
95ede28
Update modules/csv/csv.go
richmahn Oct 30, 2021
b938db4
Update modules/csv/csv.go
richmahn Oct 30, 2021
772095d
Adds comments
richmahn Oct 30, 2021
8a54f7a
Merge branch 'main' into richmahn-16558-guess-delimiter
wxiaoguang Oct 30, 2021
059ae3d
Update modules/csv/csv.go
zeripath Oct 30, 2021
88a2642
Merge branch 'main' into richmahn-16558-guess-delimiter
wxiaoguang Oct 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fixes #16558 - properly determine CSV delmiiter
richmahn committed Oct 27, 2021
commit 56f099581adff2d3c23e996f24043501747831a6
63 changes: 25 additions & 38 deletions modules/csv/csv.go
Original file line number Diff line number Diff line change
@@ -7,7 +7,6 @@ package csv
import (
"bytes"
stdcsv "encoding/csv"
"errors"
"io"
"path/filepath"
"regexp"
@@ -18,7 +17,9 @@ import (
"code.gitea.io/gitea/modules/util"
)

var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
const maxLines = 10

var quoteRegexp = regexp.MustCompile(`["'](?:[^"'\\]|\\.)*["']`)

// CreateReader creates a csv.Reader with the given delimiter.
func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
@@ -70,53 +71,39 @@ func determineDelimiter(ctx *markup.RenderContext, data []byte) rune {

// guessDelimiter scores the input CSV data against delimiters, and returns the best match.
func guessDelimiter(data []byte) rune {
maxLines := 10
// Removes quoted values so we don't have columns with new lines in them
text := quoteRegexp.ReplaceAllLiteralString(string(data), "")
lines := strings.SplitN(text, "\n", maxLines+1)
lines = lines[:util.Min(maxLines, len(lines))]

delimiters := []rune{',', ';', '\t', '|', '@'}
bestDelim := delimiters[0]
bestScore := 0.0
for _, delim := range delimiters {
score := scoreDelimiter(lines, delim)
if score > bestScore {
bestScore = score
bestDelim = delim
}
// Make the text just be maxLines or less without cut-off lines
lines := strings.SplitN(text, "\n", maxLines+1) // Will contain at least one line, and if there are more than MaxLines, the last item holds the rest of the lines
if len(lines) > maxLines {
// If the length of lines is > maxLines we know we have the max number of lines, trim it to maxLines
lines = lines[:maxLines]
} else if len(lines) > 1 && len(strings.Join(lines, "\n")) > 1e4 {
// max # of lines of text was somehow > 10k, so probalby the last line was cut off. We remove it so it isn't used, but only if lines > 1
lines = lines[:len(lines)-1]
}

return bestDelim
}

// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV.
func scoreDelimiter(lines []string, delim rune) float64 {
countTotal := 0
countLineMax := 0
linesNotEqual := 0
// Put our 1 to 10 lines back together as a string
text = strings.Join(lines, "\n")

for _, line := range lines {
if len(line) == 0 {
continue
}

countLine := strings.Count(line, string(delim))
countTotal += countLine
if countLine != countLineMax {
if countLineMax != 0 {
linesNotEqual++
}
countLineMax = util.Max(countLine, countLineMax)
delimiters := []rune{',', '\t', ';', '|', '@'}
validDelim := delimiters[0]
validDelimColCount := 0
for _, delim := range delimiters {
csvReader := stdcsv.NewReader(strings.NewReader(text))
csvReader.Comma = delim
if rows, err := csvReader.ReadAll(); err == nil && len(rows) > 0 && len(rows[0]) > validDelimColCount {
validDelim = delim
validDelimColCount = len(rows[0])
}
}

return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
return validDelim
}

// FormatError converts csv errors into readable messages.
func FormatError(err error, locale translation.Locale) (string, error) {
var perr *stdcsv.ParseError
if errors.As(err, &perr) {
if perr, ok := err.(*stdcsv.ParseError); ok {
if perr.Err == stdcsv.ErrFieldCount {
return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
}
281 changes: 240 additions & 41 deletions modules/csv/csv_test.go
Original file line number Diff line number Diff line change
@@ -9,6 +9,8 @@ import (
"strings"
"testing"

"code.gitea.io/gitea/modules/markup"

"github.com/stretchr/testify/assert"
)

@@ -19,67 +21,264 @@ func TestCreateReader(t *testing.T) {

//nolint
func TestCreateReaderAndDetermineDelimiter(t *testing.T) {
var csvToRowsMap = map[string][][]string{
`a;b;c
var cases = []struct {
csv string
expectedRows [][]string
expectedDelimiter rune
}{
// case 0 - semicolon delmited
{
csv: `a;b;c
1;2;3
4;5;6`: {{"a", "b", "c"}, {"1", "2", "3"}, {"4", "5", "6"}},
`col1 col2 col3
a b c
4;5;6`,
expectedRows: [][]string{
{"a", "b", "c"},
{"1", "2", "3"},
{"4", "5", "6"},
},
expectedDelimiter: ';',
},
// case 1 - tab delimited with empty fields
{
csv: `col1 col2 col3
a, b c
e f
g h i
j l
m n
m n,
p q r
u
v w x
y
`: {{"col1", "col2", "col3"},
{"a", "b", "c"},
{"", "e", "f"},
{"g", "h", "i"},
{"j", "", "l"},
{"m", "n", ""},
{"p", "q", "r"},
{"", "", "u"},
{"v", "w", "x"},
{"y", "", ""},
{"", "", ""}},
` col1,col2,col3
`,
expectedRows: [][]string{
{"col1", "col2", "col3"},
{"a,", "b", "c"},
{"", "e", "f"},
{"g", "h", "i"},
{"j", "", "l"},
{"m", "n,", ""},
{"p", "q", "r"},
{"", "", "u"},
{"v", "w", "x"},
{"y", "", ""},
{"", "", ""},
},
expectedDelimiter: '\t',
},
// case 2 - comma delimited with leading spaces
{
csv: ` col1,col2,col3
a, b, c
d,e,f
,h, i
j, ,
, , `: {{"col1", "col2", "col3"},
{"a", "b", "c"},
{"d", "e", "f"},
{"", "h", "i"},
{"j", "", ""},
{"", "", ""}},
, , `,
expectedRows: [][]string{
{"col1", "col2", "col3"},
{"a", "b", "c"},
{"d", "e", "f"},
{"", "h", "i"},
{"j", "", ""},
{"", "", ""},
},
expectedDelimiter: ',',
},
}

for csv, expectedRows := range csvToRowsMap {
rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(csv))
assert.NoError(t, err)
for n, c := range cases {
rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(c.csv))
assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
assert.EqualValues(t, c.expectedDelimiter, rd.Comma, "case %d: delimiter should be '%c', got '%c'", n, c.expectedDelimiter, rd.Comma)
rows, err := rd.ReadAll()
assert.NoError(t, err)
assert.EqualValues(t, rows, expectedRows)
assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
assert.EqualValues(t, c.expectedRows, rows, "case %d: rows should be equal", n)
}
}

func TestDetermineDelimiter(t *testing.T) {
var cases = []struct {
csv string
filename string
expectedDelimiter rune
}{
// case 0 - semicolon delmited
{
csv: "a",
filename: "test.csv",
expectedDelimiter: ',',
},
// case 1 - single column/row CSV
{
csv: "a",
filename: "",
expectedDelimiter: ',',
},
// case 2 - single column, single row CSV w/ tsv file extension (so is tabbed delimited)
{
csv: "1,2",
filename: "test.tsv",
expectedDelimiter: '\t',
},
// case 3 - two column, single row CSV w/ no filename, so will guess comma as delimiter
{
csv: "1,2",
filename: "",
expectedDelimiter: ',',
},
// case 4 - semi-colon delimited with csv extension
{
csv: "1;2",
filename: "test.csv",
expectedDelimiter: ';',
},
// case 5 - tabbed delimited with tsv extension
{
csv: "1\t2",
filename: "test.tsv",
expectedDelimiter: '\t',
},
// case 6 - tabbed delimited without any filename
{
csv: "1\t2",
filename: "",
expectedDelimiter: '\t',
},
// case 7 - tabs won't work, only commas as every row has same amount of commas
{
csv: "col1,col2\nfirst\tval,seconed\tval",
filename: "",
expectedDelimiter: ',',
},
// case 8 - While looks like comma delimited, has psv extension
{
csv: "1,2",
filename: "test.psv",
expectedDelimiter: '|',
},
// case 9 - pipe delmiited with no extension
{
csv: "1|2",
filename: "",
expectedDelimiter: '|',
},
// case 10 - semi-colon delimited with commas in values
{
csv: "1,2,3;4,5,6;7,8,9\na;b;c",
filename: "",
expectedDelimiter: ';',
},
// case 11 - semi-colon delimited with newline in content
{
csv: `"1,2,3,4";"a
b";%
c;d;#`,
filename: "",
expectedDelimiter: ';',
},
// case 12 - HTML as single value
{
csv: "<br/>",
filename: "",
expectedDelimiter: ',',
},
// case 13 - tab delimited with commas in values
{
csv: `name email note
John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimters`,
filename: "",
expectedDelimiter: '\t',
},
}

for n, c := range cases {
delimiter := determineDelimiter(&markup.RenderContext{Filename: c.filename}, []byte(c.csv))
assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
}
}

func TestGuessDelimiter(t *testing.T) {
var csvToDelimiterMap = map[string]rune{
"a": ',',
"1,2": ',',
"1;2": ';',
"1\t2": '\t',
"1|2": '|',
"1,2,3;4,5,6;7,8,9\na;b;c": ';',
"\"1,2,3,4\";\"a\nb\"\nc;d": ';',
"<br/>": ',',
"name\temail\tnote\nJohn Doe\tjohn@doe.com\tThis,note,had,a,lot,of,commas,to,test,delimters": '\t',
var cases = []struct {
csv string
expectedDelimiter rune
}{
// case 0 - single cell, comma delmited
{
csv: "a",
expectedDelimiter: ',',
},
// case 1 - two cells, comma delimited
{
csv: "1,2",
expectedDelimiter: ',',
},
// case 2 - semicolon delimited
{
csv: "1;2",
expectedDelimiter: ';',
},
// case 3 - tab delimited
{
csv: "1 2",
expectedDelimiter: '\t',
},
// case 4 - pipe delimited
{
csv: "1|2",
expectedDelimiter: '|',
},
// case 5 - semicolon delimited with commas in text
{
csv: `1,2,3;4,5,6;7,8,9
a;b;c`,
expectedDelimiter: ';',
},
// case 6 - semicolon delmited with commas in quoted text
{
csv: `"1,2,3,4";"a
b"
c;d`,
expectedDelimiter: ';',
},
// case 7 - HTML
{
csv: "<br/>",
expectedDelimiter: ',',
},
// case 8 - tab delimited with commas in value
{
csv: `name email note
John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimters`,
expectedDelimiter: '\t',
},
// case 9 - tab delimited with new lines in values, commas in values
{
csv: `1 "some,\"more
\"
quoted,
text," a
2 "some,
quoted,
text," b
3 "some,
quoted,
text" c
4 "some,
quoted,
text," d`,
expectedDelimiter: '\t',
},
// case 10 - semicolon delmited with quotes and semicolon in value
{
csv: `col1;col2
"this has a literal "" in the text";"and an ; in the text"`,
expectedDelimiter: ';',
},
}

for csv, expectedDelimiter := range csvToDelimiterMap {
assert.EqualValues(t, guessDelimiter([]byte(csv)), expectedDelimiter)
for n, c := range cases {
delimiter := guessDelimiter([]byte(c.csv))
assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
}

}