From 0214e2b98081b72ac028020384620e6507b8091c Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sat, 28 Sep 2024 21:42:39 +0200
Subject: [PATCH 1/9] Add initial CSV test case

To be updated once we have a lexer in place.
---
 lexers/testdata/csv.actual   |  2 ++
 lexers/testdata/csv.expected | 14 ++++++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 lexers/testdata/csv.actual
 create mode 100644 lexers/testdata/csv.expected
diff --git a/lexers/testdata/csv.actual b/lexers/testdata/csv.actual
new file mode 100644
index 000000000..4ec567cc9
--- /dev/null
+++ b/lexers/testdata/csv.actual
@@ -0,0 +1,2 @@
+Username, Identifier,First name
+booker12,9012,"Note ""quoted"" is a quoted word"
diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
new file mode 100644
index 000000000..25fcc28da
--- /dev/null
+++ b/lexers/testdata/csv.expected
@@ -0,0 +1,14 @@
+[
+    {"type":"LiteralString", "value": "Username"},
+    {"type":"Punctuation", "value": ","},
+    {"type":"LiteralString", "value":" Identifier"},
+    {"type":"Punctuation", "value": ","},
+    {"type":"LiteralString", "value":"First name"},
+    {"type":"Punctuation", "value":"\n"},
+    {"type":"LiteralString", "value":"booker12"},
+    {"type":"Punctuation", "value":","},
+    {"type":"LiteralNumberInteger", "value":"9012"},
+    {"type":"Punctuation", "value":","},
+    {"type":"LiteralStringDouble", "value":"\"Note \"\"quoted\"\" is a quoted word\""},
+    {"type":"Punctuation", "value":"\n"}
+]
\ No newline at end of file

From 9c408245fac5648e31067d2da0dab51db7dcec66 Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 08:59:09 +0200
Subject: [PATCH 2/9] Make a better test case

I believe this test case contains all corner cases from the spec.
---
 lexers/embedded/csv.xml      | 33 +++++++++++++++++++++++++++++++++
 lexers/testdata/csv.actual   |  4 ++--
 lexers/testdata/csv.expected | 24 +++++++++++-------------
 3 files changed, 46 insertions(+), 15 deletions(-)
 create mode 100644 lexers/embedded/csv.xml

diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml
new file mode 100644
index 000000000..d493be8aa
--- /dev/null
+++ b/lexers/embedded/csv.xml
@@ -0,0 +1,33 @@
+<!--
+Lexer for RFC-4180 compliant CSV subject to the following additions:
+- UTF-8 encoding is accepted (the RFC requires 7-bit ASCII)
+- The line terminator character can be LF or CRLF (the RFC allows CRLF only)
+
+Link to the RFC-4180 specification:
+https://tools.ietf.org/html/rfc4180
+
+Additions inspired by:
+https://github.com/frictionlessdata/datapackage/issues/204#issuecomment-193242077
+
+Future improvements:
+- Identify non-quoted numbers as LiteralNumber
+-->
+
+<lexer>
+    <config>
+        <name>CSV</name>
+        <alias>csv</alias>
+        <filename>*.csv</filename>
+        <mime_type>text/csv</mime_type>
+    </config>
+    <rules>
+        <state name="root">
+            <rule pattern="[^,\n]+">
+                <token type="LiteralString" />
+            </rule>
+            <rule pattern=".">
+                <token type="Punctuation" />
+            </rule>
+        </state>
+    </rules>
+</lexer>
diff --git a/lexers/testdata/csv.actual b/lexers/testdata/csv.actual
index 4ec567cc9..4777a6339 100644
--- a/lexers/testdata/csv.actual
+++ b/lexers/testdata/csv.actual
@@ -1,2 +1,2 @@
-Username, Identifier,First name
-booker12,9012,"Note ""quoted"" is a quoted word"
+hello,"newline:
+quote: "" comma: ,",1,"",,1.1
diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index 25fcc28da..45bc4e15c 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -1,14 +1,12 @@
 [
-    {"type":"LiteralString", "value": "Username"},
-    {"type":"Punctuation", "value": ","},
-    {"type":"LiteralString", "value":" Identifier"},
-    {"type":"Punctuation", "value": ","},
-    {"type":"LiteralString", "value":"First name"},
-    {"type":"Punctuation", "value":"\n"},
-    {"type":"LiteralString", "value":"booker12"},
-    {"type":"Punctuation", "value":","},
-    {"type":"LiteralNumberInteger", "value":"9012"},
-    {"type":"Punctuation", "value":","},
-    {"type":"LiteralStringDouble", "value":"\"Note \"\"quoted\"\" is a quoted word\""},
-    {"type":"Punctuation", "value":"\n"}
-]
\ No newline at end of file
+  {"type":"LiteralString", "value": "hello"},
+  {"type":"Punctuation", "value": ","},
+  {"type":"LiteralStringDouble", "value":"\"newline:\nquote: \"\" comma: ,\""},
+  {"type":"Punctuation", "value": ","},
+  {"type":"LiteralString", "value": "1"},
+  {"type":"Punctuation", "value": ","},
+  {"type":"LiteralStringDouble", "value": "\"\""},
+  {"type":"Punctuation", "value": ","},
+  {"type":"Punctuation", "value": ","},
+  {"type":"LiteralString", "value": "1.1"}
+]

From c3312c89dd43c43acfdb671bb115beab89ba926a Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:15:31 +0200
Subject: [PATCH 3/9] Include an error case in the test case

---
 lexers/testdata/csv.actual   | 2 +-
 lexers/testdata/csv.expected | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/lexers/testdata/csv.actual b/lexers/testdata/csv.actual
index 4777a6339..f4941483d 100644
--- a/lexers/testdata/csv.actual
+++ b/lexers/testdata/csv.actual
@@ -1,2 +1,2 @@
 hello,"newline:
-quote: "" comma: ,",1,"",,1.1
+quote: "" comma: ,",1,"",""stuff,,1.1
diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index 45bc4e15c..dfdf8f7f6 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -7,6 +7,9 @@
   {"type":"Punctuation", "value": ","},
   {"type":"LiteralStringDouble", "value": "\"\""},
   {"type":"Punctuation", "value": ","},
+  {"type":"LiteralStringDouble", "value": "\"\""},
+  {"type":"Error", "value": "stuff"},
+  {"type":"Punctuation", "value": ","},
   {"type":"Punctuation", "value": ","},
   {"type":"LiteralString", "value": "1.1"}
 ]

From 198668dcdcede3bff0f02f3774e1c3e4b451d4b8 Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:30:11 +0200
Subject: [PATCH 4/9] Try handling double quotes

---
 lexers/embedded/csv.xml      | 25 ++++++++++++++++++++++---
 lexers/testdata/csv.expected |  3 ++-
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml
index d493be8aa..fcee2b64a 100644
--- a/lexers/embedded/csv.xml
+++ b/lexers/embedded/csv.xml
@@ -22,12 +22,31 @@ Future improvements:
     </config>
     <rules>
         <state name="root">
-            <rule pattern="[^,\n]+">
-                <token type="LiteralString" />
+            <rule pattern="\r?\n">
+                <token type="Punctuation" />
             </rule>
-            <rule pattern=".">
+            <rule pattern=",">
                 <token type="Punctuation" />
             </rule>
+            <rule pattern="&quot;">
+                <token type="LiteralStringDouble" />
+                <push state="escaped" />
+            </rule>
+            <rule pattern="[^\r\n,]+">
+                <token type="LiteralString" />
+            </rule>
+        </state>
+        <state name="escaped">
+            <rule pattern="&quot;&quot;">
+                <token type="LiteralStringEscape"/>
+            </rule>
+            <rule pattern="&quot;">
+                <token type="LiteralStringDouble" />
+                <pop />
+            </rule>
+            <rule pattern="[^&quot;]+">
+                <token type="LiteralStringDouble" />
+            </rule>
         </state>
     </rules>
 </lexer>
diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index dfdf8f7f6..f0c5860cd 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -11,5 +11,6 @@
   {"type":"Error", "value": "stuff"},
   {"type":"Punctuation", "value": ","},
   {"type":"Punctuation", "value": ","},
-  {"type":"LiteralString", "value": "1.1"}
+  {"type":"LiteralString", "value": "1.1"},
+  {"type":"Punctuation", "value": "\n"}
 ]

From 35a4d771fe7985063cb27af061471ccfef129f8b Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:39:14 +0200
Subject: [PATCH 5/9] Tune expectation

Both by fixing the indentation to make the test suite happy and by
adding LiteralStringEscape to the expected output.
---
 lexers/testdata/csv.expected | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index f0c5860cd..806a1e64b 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -1,16 +1,18 @@
 [
-  {"type":"LiteralString", "value": "hello"},
-  {"type":"Punctuation", "value": ","},
-  {"type":"LiteralStringDouble", "value":"\"newline:\nquote: \"\" comma: ,\""},
-  {"type":"Punctuation", "value": ","},
-  {"type":"LiteralString", "value": "1"},
-  {"type":"Punctuation", "value": ","},
-  {"type":"LiteralStringDouble", "value": "\"\""},
-  {"type":"Punctuation", "value": ","},
-  {"type":"LiteralStringDouble", "value": "\"\""},
-  {"type":"Error", "value": "stuff"},
-  {"type":"Punctuation", "value": ","},
-  {"type":"Punctuation", "value": ","},
-  {"type":"LiteralString", "value": "1.1"},
-  {"type":"Punctuation", "value": "\n"}
+  {"type":"LiteralString","value":"hello"},
+  {"type":"Punctuation","value":","},
+  {"type":"LiteralStringDouble","value":"\"newline:\nquote: "},
+  {"type":"LiteralStringEscape","value":"\"\""},
+  {"type":"LiteralStringDouble","value":" comma: ,"},
+  {"type":"Punctuation","value":","},
+  {"type":"LiteralString","value":"1"},
+  {"type":"Punctuation","value":","},
+  {"type":"LiteralStringDouble","value":"\"\""},
+  {"type":"Punctuation","value":","},
+  {"type":"LiteralStringDouble","value":"\"\""},
+  {"type":"Error","value":"stuff"},
+  {"type":"Punctuation","value":","},
+  {"type":"Punctuation","value":","},
+  {"type":"LiteralString","value":"1.1"},
+  {"type":"Punctuation","value":"\n"}
 ]

From 06b823b52020d156aa6fc92a4b8f0f66c244d8ec Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:43:14 +0200
Subject: [PATCH 6/9] Fix escaped comma handling

---
 lexers/embedded/csv.xml      | 2 +-
 lexers/testdata/csv.expected | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml
index fcee2b64a..4958dbe71 100644
--- a/lexers/embedded/csv.xml
+++ b/lexers/embedded/csv.xml
@@ -42,7 +42,7 @@ Future improvements:
             </rule>
             <rule pattern="&quot;">
                 <token type="LiteralStringDouble" />
-                <pop />
+                <pop depth="1"/>
             </rule>
             <rule pattern="[^&quot;]+">
                 <token type="LiteralStringDouble" />
diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index 806a1e64b..751d613e5 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -3,7 +3,7 @@
   {"type":"Punctuation","value":","},
   {"type":"LiteralStringDouble","value":"\"newline:\nquote: "},
   {"type":"LiteralStringEscape","value":"\"\""},
-  {"type":"LiteralStringDouble","value":" comma: ,"},
+  {"type":"LiteralStringDouble","value":" comma: ,\""},
   {"type":"Punctuation","value":","},
   {"type":"LiteralString","value":"1"},
   {"type":"Punctuation","value":","},

From 4c32da99cb7638a69db69d356fd757133679f72b Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:45:44 +0200
Subject: [PATCH 7/9] Expect LiteralString, not Error, for text after a closing quote

---
 lexers/testdata/csv.expected | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index 751d613e5..c3e764932 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -10,7 +10,7 @@
   {"type":"LiteralStringDouble","value":"\"\""},
   {"type":"Punctuation","value":","},
   {"type":"LiteralStringDouble","value":"\"\""},
-  {"type":"Error","value":"stuff"},
+  {"type":"LiteralString","value":"stuff"},
   {"type":"Punctuation","value":","},
   {"type":"Punctuation","value":","},
   {"type":"LiteralString","value":"1.1"},

From 26c6f87c52f851b74919a5b2dd61948f497b226d Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:46:56 +0200
Subject: [PATCH 8/9] Expect consecutive commas as one coalesced Punctuation token

---
 lexers/testdata/csv.expected | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lexers/testdata/csv.expected b/lexers/testdata/csv.expected
index c3e764932..0a4e12fec 100644
--- a/lexers/testdata/csv.expected
+++ b/lexers/testdata/csv.expected
@@ -11,8 +11,7 @@
   {"type":"Punctuation","value":","},
   {"type":"LiteralStringDouble","value":"\"\""},
   {"type":"LiteralString","value":"stuff"},
-  {"type":"Punctuation","value":","},
-  {"type":"Punctuation","value":","},
+  {"type":"Punctuation","value":",,"},
   {"type":"LiteralString","value":"1.1"},
   {"type":"Punctuation","value":"\n"}
 ]

From 833c9b01a859688e4ca3be5010c8a9acbcd631c8 Mon Sep 17 00:00:00 2001
From: Johan Walles <johan.walles@gmail.com>
Date: Sun, 29 Sep 2024 09:48:54 +0200
Subject: [PATCH 9/9] Improve docs

---
 lexers/embedded/csv.xml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lexers/embedded/csv.xml b/lexers/embedded/csv.xml
index 4958dbe71..b70c2f8b1 100644
--- a/lexers/embedded/csv.xml
+++ b/lexers/embedded/csv.xml
@@ -3,14 +3,15 @@ Lexer for RFC-4180 compliant CSV subject to the following additions:
 - UTF-8 encoding is accepted (the RFC requires 7-bit ASCII)
 - The line terminator character can be LF or CRLF (the RFC allows CRLF only)
 
-Link to the RFC-4180 specification:
-https://tools.ietf.org/html/rfc4180
+Link to the RFC-4180 specification: https://tools.ietf.org/html/rfc4180
 
 Additions inspired by:
 https://github.com/frictionlessdata/datapackage/issues/204#issuecomment-193242077
 
 Future improvements:
 - Identify non-quoted numbers as LiteralNumber
+- Flag y as an error in "x"y (text following a closing quote). Currently it is
+  lexed as a separate LiteralString token.
 -->
 
 <lexer>