From 0214e2b98081b72ac028020384620e6507b8091c Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sat, 28 Sep 2024 21:42:39 +0200 Subject: [PATCH 1/9] Add initial CSV test case To be updated once we have a lexer in place. --- lexers/testdata/csv.actual | 2 ++ lexers/testdata/csv.expected | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 lexers/testdata/csv.actual create mode 100644 lexers/testdata/csv.expected diff --git a/lexers/testdata/csv.actual b/lexers/testdata/csv.actual new file mode 100644 index 000000000..4ec567cc9 --- /dev/null +++ b/lexers/testdata/csv.actual @@ -0,0 +1,2 @@ +Username, Identifier,First name +booker12,9012,"Note ""quoted"" is a quoted word" diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected new file mode 100644 index 000000000..25fcc28da --- /dev/null +++ b/lexers/testdata/csv.expected @@ -0,0 +1,14 @@ +[ + {"type":"LiteralString", "value": "Username"}, + {"type":"Punctuation", "value": ","}, + {"type":"LiteralString", "value":" Identifier"}, + {"type":"Punctuation", "value": ","}, + {"type":"LiteralString", "value":"First name"}, + {"type":"Punctuation", "value":"\n"}, + {"type":"LiteralString", "value":"booker12"}, + {"type":"Punctuation", "value":","}, + {"type":"LiteralNumberInteger", "value":"9012"}, + {"type":"Punctuation", "value":","}, + {"type":"LiteralStringDouble", "value":"\"Note \"\"quoted\"\" is a quoted word\""}, + {"type":"Punctuation", "value":"\n"} +] \ No newline at end of file From 9c408245fac5648e31067d2da0dab51db7dcec66 Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 08:59:09 +0200 Subject: [PATCH 2/9] Make a better test case I believe this test case contains all corner cases from the spec. --- lexers/embedded/csv.xml | 33 +++++++++++++++++++++++++++++++++ lexers/testdata/csv.actual | 4 ++-- lexers/testdata/csv.expected | 24 +++++++++++------------- 3 files changed, 46 insertions(+), 15 deletions(-) create mode 100644 lexers/embedded/csv.xml diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml new file mode 100644 index 000000000..d493be8aa --- /dev/null +++ b/lexers/embedded/csv.xml @@ -0,0 +1,33 @@ + + + + + CSV + csv + *.csv + text/csv + + + + + + + + + + + + diff --git a/lexers/testdata/csv.actual b/lexers/testdata/csv.actual index 4ec567cc9..4777a6339 100644 --- a/lexers/testdata/csv.actual +++ b/lexers/testdata/csv.actual @@ -1,2 +1,2 @@ -Username, Identifier,First name -booker12,9012,"Note ""quoted"" is a quoted word" +hello,"newline: +quote: "" comma: ,",1,"",,1.1 diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index 25fcc28da..45bc4e15c 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -1,14 +1,12 @@ [ - {"type":"LiteralString", "value": "Username"}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralString", "value":" Identifier"}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralString", "value":"First name"}, - {"type":"Punctuation", "value":"\n"}, - {"type":"LiteralString", "value":"booker12"}, - {"type":"Punctuation", "value":","}, - {"type":"LiteralNumberInteger", "value":"9012"}, - {"type":"Punctuation", "value":","}, - {"type":"LiteralStringDouble", "value":"\"Note \"\"quoted\"\" is a quoted word\""}, - {"type":"Punctuation", "value":"\n"} -] \ No newline at end of file + {"type":"LiteralString", "value": "hello"}, + {"type":"Punctuation", "value": ","}, + {"type":"LiteralStringDouble", "value":"\"newline:\nquote: \"\" comma: ,\""}, + {"type":"Punctuation", "value": ","}, + {"type":"LiteralString", "value": "1"}, + {"type":"Punctuation", "value": ","}, + {"type":"LiteralStringDouble", "value": "\"\""}, + {"type":"Punctuation", "value": ","}, + {"type":"Punctuation", "value": ","}, + {"type":"LiteralString", "value": "1.1"} +] From c3312c89dd43c43acfdb671bb115beab89ba926a Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:15:31 +0200 Subject: [PATCH 3/9] Include an error case in the test case --- lexers/testdata/csv.actual | 2 +- lexers/testdata/csv.expected | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lexers/testdata/csv.actual b/lexers/testdata/csv.actual index 4777a6339..f4941483d 100644 --- a/lexers/testdata/csv.actual +++ b/lexers/testdata/csv.actual @@ -1,2 +1,2 @@ hello,"newline: -quote: "" comma: ,",1,"",,1.1 +quote: "" comma: ,",1,"",""stuff,,1.1 diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index 45bc4e15c..dfdf8f7f6 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -7,6 +7,9 @@ {"type":"Punctuation", "value": ","}, {"type":"LiteralStringDouble", "value": "\"\""}, {"type":"Punctuation", "value": ","}, + {"type":"LiteralStringDouble", "value": "\"\""}, + {"type":"Error", "value": "stuff"}, + {"type":"Punctuation", "value": ","}, {"type":"Punctuation", "value": ","}, {"type":"LiteralString", "value": "1.1"} ] From 198668dcdcede3bff0f02f3774e1c3e4b451d4b8 Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:30:11 +0200 Subject: [PATCH 4/9] Try handling double quotes --- lexers/embedded/csv.xml | 25 ++++++++++++++++++++++--- lexers/testdata/csv.expected | 3 ++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml index d493be8aa..fcee2b64a 100644 --- a/lexers/embedded/csv.xml +++ b/lexers/embedded/csv.xml @@ -22,12 +22,31 @@ Future improvements: - - + + - + + + + + + + + + + + + + + + + + + + + diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index dfdf8f7f6..f0c5860cd 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -11,5 +11,6 @@ {"type":"Error", "value": "stuff"}, {"type":"Punctuation", "value": ","}, {"type":"Punctuation", "value": ","}, - {"type":"LiteralString", "value": "1.1"} + {"type":"LiteralString", "value": "1.1"}, + {"type":"Punctuation", "value": "\n"} ] From 35a4d771fe7985063cb27af061471ccfef129f8b Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:39:14 +0200 Subject: [PATCH 5/9] Tune expectation Both by fixing the indentation to make the test suite happy and by adding LiteralStringEscape to the expected output. --- lexers/testdata/csv.expected | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index f0c5860cd..806a1e64b 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -1,16 +1,18 @@ [ - {"type":"LiteralString", "value": "hello"}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralStringDouble", "value":"\"newline:\nquote: \"\" comma: ,\""}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralString", "value": "1"}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralStringDouble", "value": "\"\""}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralStringDouble", "value": "\"\""}, - {"type":"Error", "value": "stuff"}, - {"type":"Punctuation", "value": ","}, - {"type":"Punctuation", "value": ","}, - {"type":"LiteralString", "value": "1.1"}, - {"type":"Punctuation", "value": "\n"} + {"type":"LiteralString","value":"hello"}, + {"type":"Punctuation","value":","}, + {"type":"LiteralStringDouble","value":"\"newline:\nquote: "}, + {"type":"LiteralStringEscape","value":"\"\""}, + {"type":"LiteralStringDouble","value":" comma: ,"}, + {"type":"Punctuation","value":","}, + {"type":"LiteralString","value":"1"}, + {"type":"Punctuation","value":","}, + {"type":"LiteralStringDouble","value":"\"\""}, + {"type":"Punctuation","value":","}, + {"type":"LiteralStringDouble","value":"\"\""}, + {"type":"Error","value":"stuff"}, + {"type":"Punctuation","value":","}, + {"type":"Punctuation","value":","}, + {"type":"LiteralString","value":"1.1"}, + {"type":"Punctuation","value":"\n"} ] From 06b823b52020d156aa6fc92a4b8f0f66c244d8ec Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:43:14 +0200 Subject: [PATCH 6/9] Fix escaped comma handling --- lexers/embedded/csv.xml | 2 +- lexers/testdata/csv.expected | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml index fcee2b64a..4958dbe71 100644 --- a/lexers/embedded/csv.xml +++ b/lexers/embedded/csv.xml @@ -42,7 +42,7 @@ Future improvements: - + diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index 806a1e64b..751d613e5 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -3,7 +3,7 @@ {"type":"Punctuation","value":","}, {"type":"LiteralStringDouble","value":"\"newline:\nquote: "}, {"type":"LiteralStringEscape","value":"\"\""}, - {"type":"LiteralStringDouble","value":" comma: ,"}, + {"type":"LiteralStringDouble","value":" comma: ,\""}, {"type":"Punctuation","value":","}, {"type":"LiteralString","value":"1"}, {"type":"Punctuation","value":","}, From 4c32da99cb7638a69db69d356fd757133679f72b Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:45:44 +0200 Subject: [PATCH 7/9] Reality check, this is easier --- lexers/testdata/csv.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index 751d613e5..c3e764932 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -10,7 +10,7 @@ {"type":"LiteralStringDouble","value":"\"\""}, {"type":"Punctuation","value":","}, {"type":"LiteralStringDouble","value":"\"\""}, - {"type":"Error","value":"stuff"}, + {"type":"LiteralString","value":"stuff"}, {"type":"Punctuation","value":","}, {"type":"Punctuation","value":","}, {"type":"LiteralString","value":"1.1"}, From 26c6f87c52f851b74919a5b2dd61948f497b226d Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:46:56 +0200 Subject: [PATCH 8/9] Maybe this is coalesced somehow? --- lexers/testdata/csv.expected | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected index c3e764932..0a4e12fec 100644 --- a/lexers/testdata/csv.expected +++ b/lexers/testdata/csv.expected @@ -11,8 +11,7 @@ {"type":"Punctuation","value":","}, {"type":"LiteralStringDouble","value":"\"\""}, {"type":"LiteralString","value":"stuff"}, - {"type":"Punctuation","value":","}, - {"type":"Punctuation","value":","}, + {"type":"Punctuation","value":",,"}, {"type":"LiteralString","value":"1.1"}, {"type":"Punctuation","value":"\n"} ] From 833c9b01a859688e4ca3be5010c8a9acbcd631c8 Mon Sep 17 00:00:00 2001 From: Johan Walles Date: Sun, 29 Sep 2024 09:48:54 +0200 Subject: [PATCH 9/9] Improve docs --- lexers/embedded/csv.xml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml index 4958dbe71..b70c2f8b1 100644 --- a/lexers/embedded/csv.xml +++ b/lexers/embedded/csv.xml @@ -3,14 +3,15 @@ Lexer for RFC-4180 compliant CSV subject to the following additions: - UTF-8 encoding is accepted (the RFC requires 7-bit ASCII) - The line terminator character can be LF or CRLF (the RFC allows CRLF only) -Link to the RFC-4180 specification: -https://tools.ietf.org/html/rfc4180 +Link to the RFC-4180 specification: https://tools.ietf.org/html/rfc4180 Additions inspired by: https://github.com/frictionlessdata/datapackage/issues/204#issuecomment-193242077 Future improvements: - Identify non-quoted numbers as LiteralNumber +- Identify y as an error in "x"y. Currently it's identified as another string + literal. -->