diff --git a/compiler/esy.json b/compiler/esy.json index d758fb791f..3a5e922596 100644 --- a/compiler/esy.json +++ b/compiler/esy.json @@ -3,7 +3,7 @@ "version": "0.5.13", "esy": { "build": [ - "dune build @native --no-buffer" + "dune build @native @install --no-buffer" ], "buildEnv": { "DUNE_BUILD_DIR": "#{self.target_dir}", diff --git a/compiler/src/dune b/compiler/src/dune index 8a78f69fa7..907d79ad2d 100644 --- a/compiler/src/dune +++ b/compiler/src/dune @@ -1,12 +1,32 @@ (library (name grain) (public_name grain) + (modules compile) (libraries cmdliner compiler-libs.common grain_codegen grain_linking grain_middle_end grain_parsing grain_typed grain_utils grain_diagnostics ppx_sexp_conv.runtime-lib sexplib) (preprocess (pps ppx_sexp_conv))) +(rule + (action + (with-stdout-to + tree-sitter.json + (run %{dep:tree_sitter.exe})))) + +(install + (section share) + (files tree-sitter.json) + (package grain)) + +(executable + (name tree_sitter) + (modules tree_sitter) + (libraries menhirSdk grain_parsing ppx_deriving_yojson.runtime yojson + grain_utils) + (preprocess + (pps ppx_sexp_conv ppx_deriving_yojson sedlex.ppx))) + (install (section lib) (files) diff --git a/compiler/src/parsing/dune b/compiler/src/parsing/dune index eeb24e8546..05c6d45cc7 100644 --- a/compiler/src/parsing/dune +++ b/compiler/src/parsing/dune @@ -1,6 +1,6 @@ (menhir (modules parser) - (flags --explain --unused-tokens --strict)) + (flags --explain --unused-tokens --strict --cmly)) ;; The following two rules create a copy of the file parser.mly named ;; unitActionsParser.mly. This is a copy of the grammar where the semantic @@ -20,7 +20,7 @@ (menhir (modules unitActionsParser) (flags --table --external-tokens Parser --unused-tokens - --unused-precedence-levels)) + --unused-precedence-levels --cmly)) ;; This rule compiles the parser.messages file into OCaml code. 
diff --git a/compiler/src/parsing/lexer.re b/compiler/src/parsing/lexer.re index 191604e759..a3cd49ba54 100644 --- a/compiler/src/parsing/lexer.re +++ b/compiler/src/parsing/lexer.re @@ -142,7 +142,7 @@ let dec_float_integral = [%sedlex.regexp? ]; let dec_float_alphabetic = [%sedlex.regexp? "Infinity" | "NaN"]; -let dec_float = [%sedlex.regexp? +let unsigned_float = [%sedlex.regexp? (hex_float_integral, hex_float_decimal, hex_float_exp) | (hex_int, hex_float_exp) | (dec_float_integral, dec_float_decimal, Opt(dec_float_exp)) | @@ -150,7 +150,6 @@ let dec_float = [%sedlex.regexp? dec_float_alphabetic ]; -let unsigned_float = [%sedlex.regexp? dec_float]; let invalid_float = [%sedlex.regexp? (dec_float_decimal, Opt(dec_float_exp)) ]; diff --git a/compiler/src/parsing/parser.mly b/compiler/src/parsing/parser.mly index 47abbf9223..e0f39e941c 100644 --- a/compiler/src/parsing/parser.mly +++ b/compiler/src/parsing/parser.mly @@ -11,40 +11,43 @@ include Parser_header module Grain_parsing = struct end %} - -%token RATIONAL -%token NUMBER_INT NUMBER_FLOAT -%token INT8 INT16 INT32 INT64 UINT8 UINT16 UINT32 UINT64 FLOAT32 FLOAT64 BIGINT -%token WASMI32 WASMI64 WASMF32 WASMF64 -%token LIDENT UIDENT -%token STRING BYTES CHAR -%token LBRACK LBRACKRCARET RBRACK LPAREN RPAREN LBRACE RBRACE LCARET RCARET -%token COMMA SEMI AS -%token THICKARROW ARROW -%token EQUAL GETS -%token UNDERSCORE -%token COLON QUESTION DOT ELLIPSIS - -%token ASSERT FAIL EXCEPTION THROW - -%token TRUE FALSE VOID - -%token LET MUT REC IF WHEN ELSE MATCH WHILE FOR CONTINUE BREAK RETURN +(* Regex patterns are used by tooling like tree-sitter; tokens consumed by the parser are produced by the lexer *) +%token RATIONAL [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)/\\-?([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)r"] +%token NUMBER_INT [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)"] +%token 
NUMBER_FLOAT [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)"] +%token INT8 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)s"] INT16 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)S"] INT32 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)l"] INT64 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)L"] UINT8 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)us"] UINT16 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)uS"] UINT32 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)ul"] UINT64 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)uL"] BIGINT [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)t"] +%token FLOAT32 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)f"] FLOAT64 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)d"] +%token WASMI32 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)n"] WASMI64 [@pattern "([0-9][0-9_]*|0[xX][0-9a-fA-F][0-9a-fA-F_]*|0[oO][0-7][0-7_]*|0[bB][01][01_]*)N"] +%token WASMF32 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)w"] 
WASMF64 [@pattern "(0[xX][0-9a-fA-F][0-9a-fA-F_]*(\\.[0-9a-fA-F][0-9a-fA-F]*)?[pP][\\+\\-]?[0-9][0-9_]*|[0-9][0-9_]*\\.[0-9][0-9_]*([eE][\\+\\-]?[0-9][0-9_]*)?|[0-9][0-9_]*[eE][\\+\\-]?[0-9][0-9_]*|Infinity|NaN)W"] +%token LIDENT [@pattern "[\\p{XID_Start}--\\p{Lu}_]\\p{XID_Continue}*"] UIDENT [@pattern "[\\p{XID_Start}&&\\p{Lu}]\\p{XID_Continue}*"] +%token STRING [@pattern "\"(\\\"|[^\"])*\""] BYTES [@pattern "b\"(\\\"|[^\"])*\""] CHAR [@pattern "'(\\'|[^'])*'"] +%token LBRACK [@pattern "\\["] LBRACKRCARET [@pattern "\\[<"] RBRACK [@pattern "\\]"] LPAREN [@pattern "\\("] RPAREN [@pattern "\\)"] LBRACE [@pattern "\\{"] RBRACE [@pattern "\\}"] LCARET [@pattern "<"] RCARET [@pattern ">"] +%token COMMA [@pattern ","] SEMI [@pattern ";"] AS [@pattern "as"] +%token THICKARROW [@pattern "=>"] ARROW [@pattern "->"] +%token EQUAL [@pattern "="] GETS [@pattern ":="] +%token UNDERSCORE [@pattern "_"] +%token COLON [@pattern ":"] QUESTION [@pattern "\\?"] DOT [@pattern "\\."] ELLIPSIS [@pattern "\\.\\.\\."] + +%token ASSERT [@pattern "assert"] FAIL [@pattern "fail"] EXCEPTION [@pattern "exception"] THROW [@pattern "throw"] + +%token TRUE [@pattern "true"] FALSE [@pattern "false"] VOID [@pattern "void"] + +%token LET [@pattern "let"] MUT [@pattern "mut"] REC [@pattern "rec"] IF [@pattern "if"] WHEN [@pattern "when"] ELSE [@pattern "else"] MATCH [@pattern "match"] WHILE [@pattern "while"] FOR [@pattern "for"] CONTINUE [@pattern "continue"] BREAK [@pattern "break"] RETURN [@pattern "return"] %token AT -%token INFIX_10 INFIX_30 INFIX_40 INFIX_50 INFIX_60 INFIX_70 -%token INFIX_80 INFIX_90 INFIX_100 INFIX_110 INFIX_120 -%token PREFIX_150 -%token INFIX_ASSIGNMENT_10 +%token INFIX_10 INFIX_30 [@pattern "(\\|\\||\\?\\?)[$&*/+=><^|!?%:.-]*"] INFIX_40 [@pattern "&&[$&*/+=><^|!?%:.-]*"] INFIX_50 [@pattern "\\|[$&*/+=><^|!?%:.-]*"] INFIX_60 [@pattern "\\^[$&*/+=><^|!?%:.-]*"] INFIX_70 [@pattern "&[$&*/+=><^|!?%:.-]*"] +%token INFIX_80 [@pattern "(is|isnt|(==|!=)[$&*/+=><^|!?%:.-]*)"]
INFIX_90 [@pattern "(<[$&*/+=>^|!?%:.-][$&*/+=><^|!?%:.-]*|>[$&*/+=<^|!?%:.-][$&*/+=><^|!?%:.-]*)"] INFIX_100 [@pattern "(<<[$&*/+=><^|!?%:.-]*|>>[$&*/+=<^|!?%:.-][$&*/+=><^|!?%:.-]*)"] INFIX_110 [@pattern "(\\+|\\-)[$&*/+=><^|!?%:.-]*"] INFIX_120 [@pattern "((\\*|%)[$&*/+=><^|!?%:.-]*|/[$&+=><^|!?%:.-][$&*/+=><^|!?%:.-]*)"] +%token PREFIX_150 [@pattern "!"] +%token INFIX_ASSIGNMENT_10 [@pattern "(\\+=|\\-=|\\*=|/=|%=)"] -%token ENUM RECORD TYPE MODULE INCLUDE USE PROVIDE ABSTRACT FOREIGN WASM PRIMITIVE -%token AND -%token EXCEPT FROM STAR -%token SLASH DASH PIPE +%token ENUM [@pattern "enum"] RECORD [@pattern "record"] TYPE [@pattern "type"] MODULE [@pattern "module"] INCLUDE [@pattern "include"] USE [@pattern "use"] PROVIDE [@pattern "provide"] ABSTRACT [@pattern "abstract"] FOREIGN [@pattern "foreign"] WASM [@pattern "wasm"] PRIMITIVE [@pattern "primitive"] +%token AND [@pattern "and"] +%token EXCEPT [@pattern "except"] FROM [@pattern "from"] STAR [@pattern "\\*"] +%token SLASH [@pattern "/"] DASH [@pattern "-"] PIPE [@pattern "\\|"] %token EOL EOF // reserved tokens -%token TRY CATCH COLONCOLON MACRO YIELD +%token TRY [@pattern "try"] CATCH [@pattern "catch"] COLONCOLON [@pattern "::"] MACRO [@pattern "macro"] YIELD [@pattern "yield"] // Not a real token, this is injected by the lexer %token FUN diff --git a/compiler/src/tree_sitter.re b/compiler/src/tree_sitter.re new file mode 100644 index 0000000000..64102b3987 --- /dev/null +++ b/compiler/src/tree_sitter.re @@ -0,0 +1,123 @@ +// Referencing https://github.com/marceline-cramer/tree-sitter-grain/blob/main/src/grammar.json to compare output + +// MenhirSdk API: https://gitlab.inria.fr/fpottier/menhir/-/blob/master/sdk/cmly_api.ml +module Grammar = + MenhirSdk.Cmly_read.Read({ + let filename = "parsing/parser.cmly"; + }); + +// https://github.com/tree-sitter/tree-sitter/blob/20924fa4cdeb10d82ac308481e39bf8519334e55/cli/src/generate/parse_grammar.rs#L10 +// Also useful:
https://tree-sitter.github.io/tree-sitter/creating-parsers#the-grammar-dsl +type tree_sitter_node = + | String({value: string}) + | Pattern({value: string}); + +// Needs to map to JSONSchema defined at https://github.com/tree-sitter/tree-sitter/blob/20924fa4cdeb10d82ac308481e39bf8519334e55/cli/src/generate/grammar-schema.json +let yojson_of_tree_sitter_node = (node: tree_sitter_node) => { + switch (node) { + | String({value}) => + `Assoc([("type", `String("STRING")), ("value", `String(value))]) + | Pattern({value}) => + `Assoc([ + ("type", `String("PATTERN")), + ("value", `String(value)), + ("flags", `String("v")), + ]) + }; +}; + +module StringMap = + Map.Make({ + type t = string; + let compare = compare; + }); + +let yojson_of_stringmap = (m: StringMap.t(tree_sitter_node)) => { + let a = + StringMap.bindings(m) + |> List.map(((key, node)) => (key, yojson_of_tree_sitter_node(node))); + `Assoc(a); +}; + +// https://github.com/tree-sitter/tree-sitter/blob/20924fa4cdeb10d82ac308481e39bf8519334e55/cli/src/generate/parse_grammar.rs#L75 +[@deriving to_yojson] +type grammar_json = { + name: string, + [@to_yojson yojson_of_stringmap] + rules: StringMap.t(tree_sitter_node), + // #[serde(default)] + // precedences: Vec>, + // #[serde(default)] + // conflicts: Vec>, + // #[serde(default)] + // externals: Vec, + // #[serde(default)] + // extras: Vec, + // #[serde(default)] + // inline: Vec, + // #[serde(default)] + // supertypes: Vec, + // word: Option, +}; + +let _ = { + // List.iter( + // ((a, b, c)) => { + // // hold + // print_endline(Grammar.Nonterminal.name(a)); + // print_endline(Grammar.Nonterminal.name(Grammar.Production.lhs(b))); + // Array.iter( + // ((sym, id, attrs)) => {print_endline(id)}, + // Grammar.Production.rhs(b), + // ); + // }, + // Grammar.Grammar.entry_points, + // ); + // Grammar.Nonterminal.iter(nt => { + // print_endline(Grammar.Nonterminal.name(nt)) + // }); + + let grammar: grammar_json = + Grammar.Terminal.fold( + (t, acc) => { + switch 
(Grammar.Terminal.kind(t)) { + | `REGULAR => + let name = Grammar.Terminal.name(t); + let node = + List.find_map( + attr => { + switch (Grammar.Attribute.label(attr)) { + | "pattern" => + let pattern = Grammar.Attribute.payload(attr); + let pattern = + Grain_utils.String_utils.slice( + ~first=1, + ~last=-1, + pattern, + ); + let pattern = Scanf.unescaped(pattern); + Some(Pattern({value: pattern})); + | _ => None + } + }, + Grammar.Terminal.attributes(t), + ); + switch (node) { + | Some(node) => { + ...acc, + rules: StringMap.add(name, node, acc.rules), + } + // TODO: Throw in the future + | None => acc + }; + | `ERROR => acc + | `PSEUDO => acc + | `EOF => acc + } + }, + {name: "grain", rules: StringMap.empty}, + ); + print_endline( + Yojson.Safe.pretty_to_string(grammar_json_to_yojson(grammar)), + ); +};