Skip to content

Commit

Permalink
Add CSV lexer (#1005)
Browse files Browse the repository at this point in the history
  • Loading branch information
walles authored Nov 10, 2024
1 parent 5e7b53e commit e76e1e2
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 0 deletions.
53 changes: 53 additions & 0 deletions lexers/embedded/csv.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<!--
Lexer for RFC-4180 compliant CSV subject to the following additions:
- UTF-8 encoding is accepted (the RFC requires 7-bit ASCII)
- The line terminator character can be LF or CRLF (the RFC allows CRLF only)
Link to the RFC-4180 specification: https://tools.ietf.org/html/rfc4180
Additions inspired by:
https://github.com/frictionlessdata/datapackage/issues/204#issuecomment-193242077
Future improvements:
- Identify non-quoted numbers as LiteralNumber
- Identify y as an error in "x"y. Currently the text after the closing quote is
lexed as a separate, unquoted string literal (LiteralString).
-->

<lexer>
  <config>
    <name>CSV</name>
    <alias>csv</alias>
    <filename>*.csv</filename>
    <mime_type>text/csv</mime_type>
  </config>
  <rules>
    <!-- Outside of any quoted field: separators, unquoted fields, and the
         opening quote of a quoted field. -->
    <state name="root">
      <!-- Record separator: LF, optionally preceded by CR. -->
      <rule pattern="\r?\n">
        <token type="Punctuation"/>
      </rule>
      <!-- Field separator. -->
      <rule pattern=",">
        <token type="Punctuation"/>
      </rule>
      <!-- Opening quote: enter the "escaped" state, which is left again at
           the matching closing quote. -->
      <rule pattern="&quot;">
        <token type="LiteralStringDouble"/>
        <push state="escaped"/>
      </rule>
      <!-- Unquoted field: a run of characters up to the next comma, CR or LF.
           Listed after the quote rule, so text starting with a quote is
           handled there instead. -->
      <rule pattern="[^\r\n,]+">
        <token type="LiteralString"/>
      </rule>
    </state>
    <!-- Inside a quoted field. Commas and line breaks are ordinary content
         here. -->
    <state name="escaped">
      <!-- Doubled quote: an escaped quote character. Listed ahead of the
           closing-quote rule so that it takes precedence. -->
      <rule pattern="&quot;&quot;">
        <token type="LiteralStringEscape"/>
      </rule>
      <!-- Closing quote: return to the state that pushed this one. -->
      <rule pattern="&quot;">
        <token type="LiteralStringDouble"/>
        <pop depth="1"/>
      </rule>
      <!-- Field content: any run of characters that are not quotes. -->
      <rule pattern="[^&quot;]+">
        <token type="LiteralStringDouble"/>
      </rule>
    </state>
  </rules>
</lexer>
2 changes: 2 additions & 0 deletions lexers/testdata/csv.actual
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
hello,"newline:
quote: "" comma: ,",1,"",""stuff,,1.1
17 changes: 17 additions & 0 deletions lexers/testdata/csv.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[
{"type":"LiteralString","value":"hello"},
{"type":"Punctuation","value":","},
{"type":"LiteralStringDouble","value":"\"newline:\nquote: "},
{"type":"LiteralStringEscape","value":"\"\""},
{"type":"LiteralStringDouble","value":" comma: ,\""},
{"type":"Punctuation","value":","},
{"type":"LiteralString","value":"1"},
{"type":"Punctuation","value":","},
{"type":"LiteralStringDouble","value":"\"\""},
{"type":"Punctuation","value":","},
{"type":"LiteralStringDouble","value":"\"\""},
{"type":"LiteralString","value":"stuff"},
{"type":"Punctuation","value":",,"},
{"type":"LiteralString","value":"1.1"},
{"type":"Punctuation","value":"\n"}
]

0 comments on commit e76e1e2

Please sign in to comment.