Skip to content

Commit

Permalink
fix(compiler)!: Apply correct rules for parsing Unicode whitespace (g…
Browse files Browse the repository at this point in the history
  • Loading branch information
ospencer authored Feb 1, 2023
1 parent cce2821 commit 4f19d71
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 10 deletions.
38 changes: 28 additions & 10 deletions compiler/src/parsing/lexer.re
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ let collect_comment = (comment_type, source, loc, lexbuf) => {
comments := [comment_type(source, loc), ...comments^];
};

// Grain follows the Unicode properties for programming languages outlined in
// https://unicode.org/reports/tr31/#Pattern_Syntax

// ASCII digit classes used by the numeric literal and escape regexps.
// Decimal digits: '0' through '9'.
let dec_digit = [%sedlex.regexp? '0' .. '9'];
// Hexadecimal digits: '0'-'9' plus upper- and lower-case 'A'-'F'.
let hex_digit = [%sedlex.regexp? '0' .. '9' | 'A' .. 'F' | 'a' .. 'f'];
// Octal digits: '0' through '7'.
let oct_digit = [%sedlex.regexp? '0' .. '7'];
Expand Down Expand Up @@ -118,7 +121,9 @@ let dec_float = [%sedlex.regexp?

// An unsigned float literal; currently just an alias for `dec_float`.
let unsigned_float = [%sedlex.regexp? dec_float];

let uident = [%sedlex.regexp? (lu, Star(xid_continue))];
// Uppercase identifier: the first code point must be in both `xid_start` and
// `lu` (presumably sedlex's XID_Start and uppercase-letter classes — confirm
// against the sedlex docs), followed by zero or more `xid_continue` code points.
let uident = [%sedlex.regexp?
(Intersect(xid_start, lu), Star(xid_continue))
];
// Lowercase identifier: the first code point is either '_' or any `xid_start`
// code point that is NOT in `lu`, followed by zero or more `xid_continue`
// code points.
let lident = [%sedlex.regexp?
(Sub(xid_start, lu) | '_', Star(xid_continue))
];
Expand Down Expand Up @@ -150,22 +155,35 @@ let slash_operator_chars = [%sedlex.regexp?
(Sub(operator_char, '/' | '*'), operator_chars)
];

// Tabs and space separators (https://www.compart.com/en/unicode/category/Zs)
let blank = [%sedlex.regexp? Plus(zs | '\t')];

// Numeric escape sequences (each begins with a literal backslash).
// Braced Unicode escape: \u{ + 1 to 6 hex digits + }
let unicode_esc = [%sedlex.regexp? ("\\u{", Rep(hex_digit, 1 .. 6), "}")];
// Fixed-width Unicode escape: \u followed by 4 hex digits
let unicode4_esc = [%sedlex.regexp? ("\\u", Rep(hex_digit, 4))];
// Hex escape: \x followed by 1 or 2 hex digits
let hex_esc = [%sedlex.regexp? ("\\x", Rep(hex_digit, 1 .. 2))];
// Octal escape: backslash followed by 1 to 3 octal digits
let oct_esc = [%sedlex.regexp? ("\\", Rep(oct_digit, 1 .. 3))];
// Any one of the numeric escape forms above
let num_esc = [%sedlex.regexp? unicode_esc | unicode4_esc | hex_esc | oct_esc];

let newline_char = [%sedlex.regexp? "\r\n" | '\n'];
let newline_chars = [%sedlex.regexp?
(Star(newline_char | blank), newline_char)
// Whitespace follows Pattern_White_Space, though we separate spaces from newlines
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Pattern_White_Space=Yes:]

// A run of one or more non-newline whitespace code points. The `token` rule
// discards a `blank` match and keeps lexing. Code points, in pattern order:
//   U+0009 HORIZONTAL TABULATION
//   U+000B VERTICAL TABULATION
//   U+0020 SPACE
//   U+200E LEFT-TO-RIGHT MARK
//   U+200F RIGHT-TO-LEFT MARK
let blank = [%sedlex.regexp? Plus(0x09 | 0x0B | 0x20 | 0x200E | 0x200F)];

// A single line-terminating code point. CR and LF are separate alternatives,
// so a CRLF pair is two consecutive `newline_char` matches rather than one.
// Code points, in pattern order:
//   U+000A LINE FEED
//   U+000C FORM FEED
//   U+000D CARRIAGE RETURN
//   U+0085 NEXT LINE
//   U+2028 LINE SEPARATOR
//   U+2029 PARAGRAPH SEPARATOR
let newline_char = [%sedlex.regexp?
0x0A | 0x0C | 0x0D | 0x85 | 0x2028 | 0x2029
];
// A run of newline characters and blanks that ends in a newline character.
// The `token` rule turns one such match into a single EOL token.
let newlines = [%sedlex.regexp? (Star(newline_char | blank), newline_char)];

let line_comment = [%sedlex.regexp? ("//", Star(Compl('\r' | '\n')))];
let shebang_comment = [%sedlex.regexp? ("#!", Star(Compl('\r' | '\n')))];
// Line comment: "//" followed by any code points that are not a
// `newline_char`; the terminating newline is not part of the match.
let line_comment = [%sedlex.regexp? ("//", Star(Compl(newline_char)))];
// Shebang comment: "#!" followed by any code points that are not a
// `newline_char`; the terminating newline is not part of the match.
let shebang_comment = [%sedlex.regexp? ("#!", Star(Compl(newline_char)))];

let sub_lexeme = (lexbuf, first, last) => {
// We use this implementation over Sedlexing's sub_lexeme since it supports negative indexing
Expand Down Expand Up @@ -205,7 +223,7 @@ let rec token = lexbuf => {
Buffer.add_string(buf, "/**");
read_doc_comment(start_p, buf, lexbuf);
| blank => token(lexbuf)
| newline_chars => positioned(EOL)
| newlines => positioned(EOL)
| (unsigned_float, 'f') => positioned(FLOAT32(sub_lexeme(lexbuf, 0, -1)))
| (unsigned_float, 'd') => positioned(FLOAT64(sub_lexeme(lexbuf, 0, -1)))
| unsigned_float =>
Expand Down
134 changes: 134 additions & 0 deletions compiler/test/suites/parsing.re
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ describe("parsing", ({test, testSkip}) => {
// Use `testSkip` instead of `test` when running on the js_of_ocaml backend.
let test_or_skip =
Sys.backend_type == Other("js_of_ocaml") ? testSkip : test;
// Runners for parse and compile-error cases are built on the plain `test`.
let assertParse = makeParseRunner(test);
let assertCompileError = makeCompileErrorRunner(test);
// The file runner receives `test_or_skip`, so it gets `testSkip` on js_of_ocaml.
let assertFileRun = makeFileRunner(test_or_skip);

// operators
Expand Down Expand Up @@ -237,4 +238,137 @@ describe("parsing", ({test, testSkip}) => {
prog_loc: Location.dummy_loc,
},
);

// Whitespace tests

// Reason does not support OCaml's Unicode escapes, which is why these are
// UTF-8 byte sequences instead of pretty Unicode escapes

// Places every whitespace escape listed below after the module header and
// expects the program to parse as an empty module (no statements, no comments).
assertParse(
"whitespace_1",
// The escapes in the source string are, in order:
// \x09         U+0009 HORIZONTAL TABULATION
// \x0b         U+000B VERTICAL TABULATION
// \x20         U+0020 SPACE
// \xe2\x80\x8e U+200E LEFT-TO-RIGHT MARK
// \xe2\x80\x8f U+200F RIGHT-TO-LEFT MARK
// \x0a         U+000A LINE FEED
// \x0c         U+000C FORM FEED
// \x0d         U+000D CARRIAGE RETURN
// \xc2\x85     U+0085 NEXT LINE
// \xe2\x80\xa8 U+2028 LINE SEPARATOR
// \xe2\x80\xa9 U+2029 PARAGRAPH SEPARATOR
"
module Test
\x09
\x0b
\x20
\xe2\x80\x8e
\xe2\x80\x8f
\x0a
\x0c
\x0d
\xc2\x85
\xe2\x80\xa8
\xe2\x80\xa9
",
{
module_name: Location.mknoloc("Test"),
statements: [],
comments: [],
prog_loc: Location.dummy_loc,
},
);

// Space-like code points that are not in the lexer's `blank` or
// `newline_char` classes; each is expected to produce a lexer error.

// \xc2\xa0 is the UTF-8 encoding of U+00A0 NO-BREAK SPACE
assertCompileError(
"invalid_whitespace_nbsp",
"\xc2\xa0",
"Grain lexer doesn't recognize this token",
);
// \xe2\x80\x83 is the UTF-8 encoding of U+2003 EM SPACE
assertCompileError(
"invalid_whitespace_emspace",
"\xe2\x80\x83",
"Grain lexer doesn't recognize this token",
);
// \xe2\x80\x8a is the UTF-8 encoding of U+200A HAIR SPACE
assertCompileError(
"invalid_whitespace_hairspace",
"\xe2\x80\x8a",
"Grain lexer doesn't recognize this token",
);
// \xe3\x80\x80 is the UTF-8 encoding of U+3000 IDEOGRAPHIC SPACE
assertCompileError(
"invalid_whitespace_ideographicspace",
"\xe3\x80\x80",
"Grain lexer doesn't recognize this token",
);

// End-of-statement tests: each case puts one newline sequence between `a` and
// `b` and expects two separate top-level expression statements.
// NOTE(review): `a` and `b` are presumably expression fixtures defined earlier
// in this suite — confirm.

// \x0a: U+000A LINE FEED
assertParse(
"end_of_statement_linefeed",
"module Test; a\x0ab",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
// \x0c: U+000C FORM FEED
assertParse(
"end_of_statement_formfeed",
"module Test; a\x0cb",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
// \x0d: U+000D CARRIAGE RETURN on its own
assertParse(
"end_of_statement_carriagereturn",
"module Test; a\x0db",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
// \x0d\x0a: CARRIAGE RETURN + LINE FEED; still expected to yield exactly two
// statements
assertParse(
"end_of_statement_crlf",
"module Test; a\x0d\x0ab",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
// \xc2\x85: UTF-8 encoding of U+0085 NEXT LINE
assertParse(
"end_of_statement_nextline",
"module Test; a\xc2\x85b",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
// \xe2\x80\xa8: UTF-8 encoding of U+2028 LINE SEPARATOR
assertParse(
"end_of_statement_lineseparator",
"module Test; a\xe2\x80\xa8b",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
// \xe2\x80\xa9: UTF-8 encoding of U+2029 PARAGRAPH SEPARATOR
assertParse(
"end_of_statement_paragraphseparator",
"module Test; a\xe2\x80\xa9b",
{
module_name: Location.mknoloc("Test"),
statements: [Toplevel.expr(a), Toplevel.expr(b)],
comments: [],
prog_loc: Location.dummy_loc,
},
);
});

0 comments on commit 4f19d71

Please sign in to comment.