Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly determine CSV delimiter #17459

Merged
merged 32 commits into from
Oct 30, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
fc8dd28
Fixes #16558 CSV delimiter determiner
richmahn Oct 25, 2021
6fe4f9f
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 25, 2021
c99d94e
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 26, 2021
58e98c5
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 27, 2021
56f0995
Fixes #16558 - properly determine CSV delmiiter
richmahn Oct 27, 2021
7da56ba
Moves quoteString to a new function
richmahn Oct 27, 2021
5b3a8d9
Adds big test with lots of commas for tab delimited csv
richmahn Oct 28, 2021
b0a6290
Adds comments
richmahn Oct 28, 2021
79ee659
Shortens the text of the test
richmahn Oct 28, 2021
f508559
Removes single quotes from regexp as only double quotes need to be se…
richmahn Oct 28, 2021
e0f2862
Fixes spelling
richmahn Oct 28, 2021
ab0583c
Merge branch 'main' into richmahn-16558-guess-delimiter
wxiaoguang Oct 28, 2021
e466ab7
Fixes check of length as it probalby will only be 1e4, not greater
richmahn Oct 28, 2021
a54e6ac
Merge branch 'richmahn-16558-guess-delimiter' of github.com:richmahn/…
richmahn Oct 28, 2021
62d10e3
Makes sample size a const, properly removes truncated line
richmahn Oct 28, 2021
0a2f06d
Makes sample size a const, properly removes truncated line
richmahn Oct 28, 2021
85eda5f
Fixes comment
richmahn Oct 28, 2021
fc02c05
Fixes comment
richmahn Oct 28, 2021
baf37d3
tests for FormatError() function
richmahn Oct 28, 2021
32a9c36
Adds logic to find the limiter before or after a quoted value
richmahn Oct 28, 2021
e8ac38a
Merge remote-tracking branch 'origin/main' into richmahn-16558-guess-…
richmahn Oct 28, 2021
feabe0e
Simplifies regex
richmahn Oct 28, 2021
f808d73
Merge branch 'main' into richmahn-16558-guess-delimiter
zeripath Oct 29, 2021
0084114
Error tests
richmahn Oct 29, 2021
967da1d
Merge branch 'richmahn-16558-guess-delimiter' of github.com:richmahn/…
richmahn Oct 29, 2021
683def1
Error tests
richmahn Oct 29, 2021
95ede28
Update modules/csv/csv.go
richmahn Oct 30, 2021
b938db4
Update modules/csv/csv.go
richmahn Oct 30, 2021
772095d
Adds comments
richmahn Oct 30, 2021
8a54f7a
Merge branch 'main' into richmahn-16558-guess-delimiter
wxiaoguang Oct 30, 2021
059ae3d
Update modules/csv/csv.go
zeripath Oct 30, 2021
88a2642
Merge branch 'main' into richmahn-16558-guess-delimiter
wxiaoguang Oct 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 58 additions & 42 deletions modules/csv/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@ package csv
import (
"bytes"
stdcsv "encoding/csv"
"errors"
"io"
"path/filepath"
"regexp"
"strings"

"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/translation"
"code.gitea.io/gitea/modules/util"
)

var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
const maxLines = 10

var quoteRegexp = regexp.MustCompile(`["'](?:[^"'\\]|\\.)*["']`)
zeripath marked this conversation as resolved.
Show resolved Hide resolved

// CreateReader creates a csv.Reader with the given delimiter.
func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
Expand All @@ -30,9 +33,9 @@ func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
return rd
}

// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
// CreateReaderAndDetermineDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
// Reads at most 10k bytes.
func CreateReaderAndGuessDelimiter(rd io.Reader) (*stdcsv.Reader, error) {
func CreateReaderAndDetermineDelimiter(ctx *markup.RenderContext, rd io.Reader) (*stdcsv.Reader, error) {
var data = make([]byte, 1e4)
size, err := util.ReadAtMost(rd, data)
if err != nil {
Expand All @@ -41,59 +44,72 @@ func CreateReaderAndGuessDelimiter(rd io.Reader) (*stdcsv.Reader, error) {

return CreateReader(
io.MultiReader(bytes.NewReader(data[:size]), rd),
guessDelimiter(data[:size]),
determineDelimiter(ctx, data[:size]),
), nil
}

// guessDelimiter scores the input CSV data against delimiters, and returns the best match.
func guessDelimiter(data []byte) rune {
maxLines := 10
text := quoteRegexp.ReplaceAllLiteralString(string(data), "")
lines := strings.SplitN(text, "\n", maxLines+1)
lines = lines[:util.Min(maxLines, len(lines))]

delimiters := []rune{',', ';', '\t', '|', '@'}
bestDelim := delimiters[0]
bestScore := 0.0
for _, delim := range delimiters {
score := scoreDelimiter(lines, delim)
if score > bestScore {
bestScore = score
bestDelim = delim
}
// determineDelimiter takes a RenderContext and if it isn't nil and the Filename has an extension that specifies the delimiter,
// it is used as the delimiter. Otherwise we call guessDelimiter with the data passed
func determineDelimiter(ctx *markup.RenderContext, data []byte) rune {
extension := ".csv"
if ctx != nil {
extension = strings.ToLower(filepath.Ext(ctx.Filename))
}

var delimiter rune
switch extension {
case ".tsv":
delimiter = '\t'
case ".psv":
delimiter = '|'
default:
delimiter = guessDelimiter(data)
}

return bestDelim
return delimiter
}

// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV.
func scoreDelimiter(lines []string, delim rune) float64 {
countTotal := 0
countLineMax := 0
linesNotEqual := 0
// removeQuotedStrings uses the quoteRegexp to remove all quoted strings so that we can realiably have each row on one line
zeripath marked this conversation as resolved.
Show resolved Hide resolved
// (quoted strings often have new lines within the string)
func removeQuotedString(text string) string {
return quoteRegexp.ReplaceAllLiteralString(text, "")
}

for _, line := range lines {
if len(line) == 0 {
continue
}
// guessDelimiter scores the input CSV data against delimiters, and returns the best match.
func guessDelimiter(data []byte) rune {
// Removes quoted values so we don't have columns with new lines in them
text := removeQuotedString(string(data))
zeripath marked this conversation as resolved.
Show resolved Hide resolved

// Make the text just be maxLines or less without cut-off lines
lines := strings.SplitN(text, "\n", maxLines+1) // Will contain at least one line, and if there are more than MaxLines, the last item holds the rest of the lines
if len(lines) > maxLines {
// If the length of lines is > maxLines we know we have the max number of lines, trim it to maxLines
lines = lines[:maxLines]
} else if len(lines) > 1 && len(strings.Join(lines, "\n")) > 1e4 {
// max # of lines of text was somehow > 10k, so probalby the last line was cut off. We remove it so it isn't used, but only if lines > 1
lines = lines[:len(lines)-1]
}

countLine := strings.Count(line, string(delim))
countTotal += countLine
if countLine != countLineMax {
if countLineMax != 0 {
linesNotEqual++
}
countLineMax = util.Max(countLine, countLineMax)
// Put our 1 to 10 lines back together as a string
text = strings.Join(lines, "\n")

delimiters := []rune{',', '\t', ';', '|', '@'}
validDelim := delimiters[0]
validDelimColCount := 0
for _, delim := range delimiters {
csvReader := stdcsv.NewReader(strings.NewReader(text))
csvReader.Comma = delim
if rows, err := csvReader.ReadAll(); err == nil && len(rows) > 0 && len(rows[0]) > validDelimColCount {
validDelim = delim
validDelimColCount = len(rows[0])
}
}

return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
return validDelim
}

// FormatError converts csv errors into readable messages.
func FormatError(err error, locale translation.Locale) (string, error) {
var perr *stdcsv.ParseError
if errors.As(err, &perr) {
if perr, ok := err.(*stdcsv.ParseError); ok {
zeripath marked this conversation as resolved.
Show resolved Hide resolved
if perr.Err == stdcsv.ErrFieldCount {
return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
}
Expand Down
Loading