Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Generate tree-sitter grammar from parser #1923

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion compiler/esy.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"version": "0.5.13",
"esy": {
"build": [
"dune build @native --no-buffer"
"dune build @native @install --no-buffer"
],
"buildEnv": {
"DUNE_BUILD_DIR": "#{self.target_dir}",
Expand Down
20 changes: 20 additions & 0 deletions compiler/src/dune
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
(library
(name grain)
(public_name grain)
(modules compile)
(libraries cmdliner compiler-libs.common grain_codegen grain_linking
grain_middle_end grain_parsing grain_typed grain_utils grain_diagnostics
ppx_sexp_conv.runtime-lib sexplib)
(preprocess
(pps ppx_sexp_conv)))

(rule
(action
(with-stdout-to
tree-sitter.json
(run %{dep:tree_sitter.exe}))))

(install
(section share)
(files tree-sitter.json)
(package grain))

(executable
(name tree_sitter)
(modules tree_sitter)
(libraries menhirSdk grain_parsing ppx_deriving_yojson.runtime yojson
grain_utils)
(preprocess
(pps ppx_sexp_conv ppx_deriving_yojson sedlex.ppx)))

(install
(section lib)
(files)
Expand Down
4 changes: 2 additions & 2 deletions compiler/src/parsing/dune
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(menhir
(modules parser)
(flags --explain --unused-tokens --strict))
(flags --explain --unused-tokens --strict --cmly))

;; The following two rules create a copy of the file parser.mly named
;; unitActionsParser.mly. This is a copy of the grammar where the semantic
Expand All @@ -20,7 +20,7 @@
(menhir
(modules unitActionsParser)
(flags --table --external-tokens Parser --unused-tokens
--unused-precedence-levels))
--unused-precedence-levels --cmly))

;; This rule compiles the parser.messages file into OCaml code.

Expand Down
3 changes: 1 addition & 2 deletions compiler/src/parsing/lexer.re
Original file line number Diff line number Diff line change
Expand Up @@ -142,15 +142,14 @@ let dec_float_integral = [%sedlex.regexp?
];
let dec_float_alphabetic = [%sedlex.regexp? "Infinity" | "NaN"];

let dec_float = [%sedlex.regexp?
let unsigned_float = [%sedlex.regexp?
(hex_float_integral, hex_float_decimal, hex_float_exp) |
(hex_int, hex_float_exp) |
(dec_float_integral, dec_float_decimal, Opt(dec_float_exp)) |
(dec_float_integral, dec_float_exp) |
dec_float_alphabetic
];

let unsigned_float = [%sedlex.regexp? dec_float];
let invalid_float = [%sedlex.regexp?
(dec_float_decimal, Opt(dec_float_exp))
];
Expand Down
59 changes: 31 additions & 28 deletions compiler/src/parsing/parser.mly
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,43 @@ include Parser_header
module Grain_parsing = struct end
%}


%token <string> RATIONAL
%token <string> NUMBER_INT NUMBER_FLOAT
%token <string> INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 FLOAT32 FLOAT64 BIGINT
%token <string> WASMI32 WASMI64 WASMF32 WASMF64
%token <string> LIDENT UIDENT
%token <string> STRING BYTES CHAR
%token LBRACK LBRACKRCARET RBRACK LPAREN RPAREN LBRACE RBRACE LCARET RCARET
%token COMMA SEMI AS
%token THICKARROW ARROW
%token EQUAL GETS
%token UNDERSCORE
%token COLON QUESTION DOT ELLIPSIS

%token ASSERT FAIL EXCEPTION THROW

%token TRUE FALSE VOID

%token LET MUT REC IF WHEN ELSE MATCH WHILE FOR CONTINUE BREAK RETURN
(* Regex patterns are used by tooling like tree-sitter; tokens consumed by the parser are produced by the lexer *)
%token <string> RATIONAL [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)/\\-?([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)r"]
%token <string> NUMBER_INT [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)"]
%token <string> NUMBER_FLOAT [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)"]
%token <string> INT8 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)s"] INT16 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)S"] INT32 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)l"] INT64 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)L"] UINT8 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)us"] UINT16 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)uS"] UINT32 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)ul"] UINT64 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)uL"] BIGINT [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)t"]
%token <string> FLOAT32 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)f"] FLOAT64 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)d"]
%token <string> WASMI32 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)n"] WASMI64 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)N"]
%token <string> WASMF32 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)w"] WASMF64 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)W"]
%token <string> LIDENT [@pattern "[\\p{XID_Start}--\\p{Lu}_]\\p{XID_Continue}*"] UIDENT [@pattern "\\p{XID_Start}&&\\p{Lu}\\p{XID_Continue}*"]
%token <string> STRING [@pattern "\"(\\\"|[^\"])*\""] BYTES [@pattern "b\"(\\\"|[^\"])*\""] CHAR [@pattern "'(\\'|[^'])*'"]
%token LBRACK [@pattern "\\["] LBRACKRCARET [@pattern "\\[<"] RBRACK [@pattern "\\]"] LPAREN [@pattern "\\("] RPAREN [@pattern "\\)"] LBRACE [@pattern "\\{"] RBRACE [@pattern "\\}"] LCARET [@pattern "<"] RCARET [@pattern ">"]
%token COMMA [@pattern ","] SEMI [@pattern ";"] AS [@pattern "as"]
%token THICKARROW [@pattern "=>"] ARROW [@pattern "->"]
%token EQUAL [@pattern "="] GETS [@pattern ":="]
%token UNDERSCORE [@pattern "_"]
%token COLON [@pattern ":"] QUESTION [@pattern "\\?"] DOT [@pattern "\\."] ELLIPSIS [@pattern "\\.\\.\\."]

%token ASSERT [@pattern "assert"] FAIL [@pattern "fail"] EXCEPTION [@pattern "exception"] THROW [@pattern "throw"]

%token TRUE [@pattern "true"] FALSE [@pattern "false"] VOID [@pattern "void"]

%token LET [@pattern "let"] MUT [@pattern "mut"] REC [@pattern "rec"] IF [@pattern "if"] WHEN [@pattern "when"] ELSE [@pattern "else"] MATCH [@pattern "match"] WHILE [@pattern "while"] FOR [@pattern "for"] CONTINUE [@pattern "continue"] BREAK [@pattern "break"] RETURN [@pattern "return"]
%token AT

%token <string> INFIX_10 INFIX_30 INFIX_40 INFIX_50 INFIX_60 INFIX_70
%token <string> INFIX_80 INFIX_90 INFIX_100 INFIX_110 INFIX_120
%token <string> PREFIX_150
%token <string> INFIX_ASSIGNMENT_10
%token <string> INFIX_10 INFIX_30 [@pattern "(\\|\\||\\?\\?)[$&*/+=><^|!?%:.-]*"] INFIX_40 [@pattern "&&[$&*/+=><^|!?%:.-]*"] INFIX_50 [@pattern "\\|[$&*/+=><^|!?%:.-]*"] INFIX_60 [@pattern "\\^[$&*/+=><^|!?%:.-]*"] INFIX_70 [@pattern "&[$&*/+=><^|!?%:.-]*"]
%token <string> INFIX_80 [@pattern "(is|isnt|(==|!=)[$&*/+=><^|!?%:.-]*)"] INFIX_90 [@pattern "(<[$&*/+=>^|!?%:.-][$&*/+=><^|!?%:.-]*|>[$&*/+=<^|!?%:.-][$&*/+=><^|!?%:.-]*)"] INFIX_100 [@pattern "(<<[$&*/+=><^|!?%:.-]*|>>[$&*/+=<^|!?%:.-][$&*/+=><^|!?%:.-]*)"] INFIX_110 [@pattern "(\\+|\\-)[$&*/+=><^|!?%:.-]*"] INFIX_120 [@pattern "((\\*|%)[$&*/+=><^|!?%:.-]*|/[$&+=><^|!?%:.-][$&*/+=><^|!?%:.-]*)"]
%token <string> PREFIX_150 [@pattern "!"]
%token <string> INFIX_ASSIGNMENT_10 [@pattern "(\\+=|\\-=|\\*=|/=|%=)"]

%token ENUM RECORD TYPE MODULE INCLUDE USE PROVIDE ABSTRACT FOREIGN WASM PRIMITIVE
%token AND
%token EXCEPT FROM STAR
%token SLASH DASH PIPE
%token ENUM [@pattern "enum"] RECORD [@pattern "record"] TYPE [@pattern "type"] MODULE [@pattern "module"] INCLUDE [@pattern "include"] USE [@pattern "use"] PROVIDE [@pattern "provide"] ABSTRACT [@pattern "abstract"] FOREIGN [@pattern "foreign"] WASM [@pattern "wasm"] PRIMITIVE [@pattern "primitive"]
%token AND [@pattern "and"]
%token EXCEPT [@pattern "except"] FROM [@pattern "from"] STAR [@pattern "\\*"]
%token SLASH [@pattern "\\\\"] DASH [@pattern "-"] PIPE [@pattern "\\|"]
%token EOL EOF

// reserved tokens
%token TRY CATCH COLONCOLON MACRO YIELD
%token TRY [@pattern "try"] CATCH [@pattern "catch"] COLONCOLON [@pattern "::"] MACRO [@pattern "macro"] YIELD [@pattern "yield"]

// Not a real token, this is injected by the lexer
%token FUN
Expand Down
123 changes: 123 additions & 0 deletions compiler/src/tree_sitter.re
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Referencing https://github.com/marceline-cramer/tree-sitter-grain/blob/main/src/grammar.json to compare output

// MenhirSdk API: https://gitlab.inria.fr/fpottier/menhir/-/blob/master/sdk/cmly_api.ml
// Load the compiled grammar description (.cmly) that Menhir emits for
// parser.mly when built with the --cmly flag; the functor exposes the
// grammar's terminals, nonterminals, productions, and their attributes
// for introspection.
// NOTE(review): the path is relative to the process working directory at
// build time (the dune rule runs this executable from compiler/src) —
// confirm before invoking it from anywhere else.
module Grammar =
  MenhirSdk.Cmly_read.Read({
    let filename = "parsing/parser.cmly";
  });

// https://github.com/tree-sitter/tree-sitter/blob/20924fa4cdeb10d82ac308481e39bf8519334e55/cli/src/generate/parse_grammar.rs#L10
// Also useful: https://tree-sitter.github.io/tree-sitter/creating-parsers#the-grammar-dsl
// A single rule node of a tree-sitter grammar. Only the two leaf kinds
// needed so far are modeled:
//   String  — a literal token  ("type": "STRING" in grammar.json)
//   Pattern — a regular expression ("type": "PATTERN" in grammar.json)
type tree_sitter_node =
  | String({value: string})
  | Pattern({value: string});

// Needs to map to JSONSchema defined at https://github.com/tree-sitter/tree-sitter/blob/20924fa4cdeb10d82ac308481e39bf8519334e55/cli/src/generate/grammar-schema.json
// Serialize one rule node to the JSON shape tree-sitter's grammar-schema
// expects. STRING nodes carry only a value; PATTERN nodes additionally set
// the "v" regex flag.
let yojson_of_tree_sitter_node = (node: tree_sitter_node) => {
  let fields =
    switch (node) {
    | String({value}) => [
        ("type", `String("STRING")),
        ("value", `String(value)),
      ]
    | Pattern({value}) => [
        ("type", `String("PATTERN")),
        ("value", `String(value)),
        ("flags", `String("v")),
      ]
    };
  `Assoc(fields);
};

// String-keyed map used for the grammar's "rules" object. Map keeps
// bindings ordered by key, which makes the emitted JSON deterministic.
module StringMap =
  Map.Make({
    type t = string;
    let compare = compare;
  });

// Serialize the rule map as one JSON object member per rule, in ascending
// key order (Map folds in increasing key order; the accumulator is built
// backwards and reversed once at the end).
let yojson_of_stringmap = (m: StringMap.t(tree_sitter_node)) => {
  let members =
    StringMap.fold(
      (key, node, acc) => [(key, yojson_of_tree_sitter_node(node)), ...acc],
      m,
      [],
    );
  `Assoc(List.rev(members));
};

// https://github.com/tree-sitter/tree-sitter/blob/20924fa4cdeb10d82ac308481e39bf8519334e55/cli/src/generate/parse_grammar.rs#L75
// Mirrors the top-level object of tree-sitter's grammar.json. The fields
// commented out below exist in the upstream schema (see the Rust struct
// linked above) but are not generated yet.
[@deriving to_yojson]
type grammar_json = {
  name: string,
  // StringMap has no derived serializer, so point ppx_deriving_yojson at
  // the hand-written one above.
  [@to_yojson yojson_of_stringmap]
  rules: StringMap.t(tree_sitter_node),
  // #[serde(default)]
  // precedences: Vec<Vec<RuleJSON>>,
  // #[serde(default)]
  // conflicts: Vec<Vec<String>>,
  // #[serde(default)]
  // externals: Vec<RuleJSON>,
  // #[serde(default)]
  // extras: Vec<RuleJSON>,
  // #[serde(default)]
  // inline: Vec<String>,
  // #[serde(default)]
  // supertypes: Vec<String>,
  // word: Option<String>,
};

// Entry point: fold over every terminal of the compiled grammar, collect
// the [@pattern "..."] attribute attached to each regular token in
// parser.mly, and print the resulting grammar.json to stdout (the dune
// rule redirects this into tree-sitter.json).
let _ = {
  // Earlier exploration code, kept for reference while the generator is a
  // draft:
  // List.iter(
  //   ((a, b, c)) => {
  //     // hold
  //     print_endline(Grammar.Nonterminal.name(a));
  //     print_endline(Grammar.Nonterminal.name(Grammar.Production.lhs(b)));
  //     Array.iter(
  //       ((sym, id, attrs)) => {print_endline(id)},
  //       Grammar.Production.rhs(b),
  //     );
  //   },
  //   Grammar.Grammar.entry_points,
  // );
  // Grammar.Nonterminal.iter(nt => {
  //   print_endline(Grammar.Nonterminal.name(nt))
  // });

  let grammar: grammar_json =
    Grammar.Terminal.fold(
      (t, acc) => {
        switch (Grammar.Terminal.kind(t)) {
        // Only ordinary tokens contribute rules; Menhir's synthetic
        // error/pseudo/EOF terminals are ignored.
        | `REGULAR =>
          let name = Grammar.Terminal.name(t);
          let node =
            List.find_map(
              attr => {
                switch (Grammar.Attribute.label(attr)) {
                | "pattern" =>
                  // The payload arrives verbatim from the .mly source,
                  // including the surrounding double quotes: strip them...
                  let pattern = Grammar.Attribute.payload(attr);
                  let pattern =
                    Grain_utils.String_utils.slice(
                      ~first=1,
                      ~last=-1,
                      pattern,
                    );
                  // ...then undo OCaml string escapes (\\, \" …) so the raw
                  // regex text is what lands in the JSON value.
                  // NOTE(review): assumes slice treats a negative ~last as
                  // end-relative — confirm against String_utils.
                  let pattern = Scanf.unescaped(pattern);
                  Some(Pattern({value: pattern}));
                | _ => None
                }
              },
              Grammar.Terminal.attributes(t),
            );
          switch (node) {
          | Some(node) => {
              ...acc,
              rules: StringMap.add(name, node, acc.rules),
            }
          // Tokens without a [@pattern] attribute are silently skipped.
          // TODO: Throw in the future
          | None => acc
          };
        | `ERROR => acc
        | `PSEUDO => acc
        | `EOF => acc
        }
      },
      {name: "grain", rules: StringMap.empty},
    );
  print_endline(
    Yojson.Safe.pretty_to_string(grammar_json_to_yojson(grammar)),
  );
};
Loading