diff --git a/README.md b/README.md
index e73d865..e736091 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ First, define your lexical specification in JSON format. As an example, let's wr
 
 ```json
 {
+    "name": "statement",
     "entries": [
         {
             "kind": "whitespace",
@@ -43,14 +44,14 @@ First, define your lexical specification in JSON format. As an example, let's wr
 }
 ```
 
-Save the above specification to a file in UTF-8. In this explanation, the file name is lexspec.json.
+Save the above specification to a file in UTF-8. In this explanation, the file name is `statement.json`.
 
 ### 2. Compile the lexical specification
 
 Next, generate a DFA from the lexical specification using `maleeni compile` command.
 
 ```sh
-$ maleeni compile -l lexspec.json -o clexspec.json
+$ maleeni compile -l statement.json -o statementc.json
 ```
 
 ### 3. Debug (Optional)
@@ -60,7 +61,7 @@ If you want to make sure that the lexical specification behaves as expected, you
 ⚠️ An encoding that `maleeni lex` and the driver can handle is only UTF-8.
 
 ```sh
-$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_name, .lexeme, .eof] | @csv'
+$ echo -n 'The truth is out there.' | maleeni lex statementc.json | jq -r '[.kind_name, .lexeme, .eof] | @csv'
 "word","The",false
 "whitespace"," ",false
 "word","truth",false
@@ -94,10 +95,10 @@ The JSON format of tokens that `maleeni lex` command prints is as follows:
 Using `maleeni-go` command, you can generate a source code of the lexer to recognize your lexical specification.
 
 ```sh
-$ maleeni-go clexspec.json > lexer.go
+$ maleeni-go statementc.json
 ```
 
-The above command generates the lexer and saves it to `lexer.go` file. To use the lexer, you need to call `NewLexer` function defined in `lexer.go`. The following code is a simple example. In this example, the lexer reads a source code from stdin and writes the result, tokens, to stdout.
+The above command generates the lexer and saves it to the `statement_lexer.go` file. By default, the file name will be `{spec name}_lexer.go`. To use the lexer, you need to call the `NewLexer` function defined in `statement_lexer.go`. The following code is a simple example. In this example, the lexer reads a source code from stdin and writes the result, tokens, to stdout.
 
 ```go
 package main
@@ -136,14 +137,14 @@ Please save the above source code to `main.go` and create a directory structure
 
 ```
 /project_root
-├── lexer.go ... Lexer generated from the compiled lexical specification (the result of `maleeni-go`).
-└── main.go .... Caller of the lexer.
+├── statement_lexer.go ... Lexer generated from the compiled lexical specification (the result of `maleeni-go`).
+└── main.go .............. Caller of the lexer.
```
 
 Now, you can perform the lexical analysis.
 
 ```sh
-$ echo -n 'I want to believe.' | go run main.go lexer.go
+$ echo -n 'I want to believe.' | go run main.go statement_lexer.go
 valid: word: 'I'
 valid: whitespace: ' '
 valid: word: 'want'
@@ -164,8 +165,9 @@ The lexical specification format to be passed to `maleeni compile` command is as
 
 top level object:
 
-| Field   | Type                   | Nullable | Description                                                                                                                |
-|---------|------------------------|----------|----------------------------------------------------------------------------------------------------------------------------|
+| Field   | Type                   | Nullable | Description                                                                                                                |
+|---------|------------------------|----------|----------------------------------------------------------------------------------------------------------------------------|
+| name    | string                 | false    | A specification name.                                                                                                      |
 | entries | array of entry objects | false    | An array of entries sorted by priority. The first element has the highest priority, and the last has the lowest priority. |
 
 entry object:
@@ -292,6 +294,7 @@ For instance, you can define [an identifier of golang](https://golang.org/ref/sp
 
 ```json
 {
+    "name": "id",
     "entries": [
         {
             "fragment": true,
@@ -326,6 +329,7 @@ For instance, you can define a subset of [the string literal of golang](https://
 
 ```json
 {
+    "name": "string",
     "entries": [
         {
             "kind": "string_open",
@@ -369,7 +373,7 @@ For instance, you can define a subset of [the string literal of golang](https://
 
 In the above specification, when the `"` mark appears in default mode (it's the initial mode), the driver transitions to the `string` mode and interprets character sequences (`char_seq`) and escape sequences (`escaped_char`). When the `"` mark appears the next time, the driver returns to the `default` mode.
 
 ```sh
-$ echo -n '"foo\nbar"foo' | maleeni lex go-string-cspec.json | jq -r '[.mode_name, .kind_name, .lexeme, .eof] | @csv'
+$ echo -n '"foo\nbar"foo' | maleeni lex stringc.json | jq -r '[.mode_name, .kind_name, .lexeme, .eof] | @csv'
 "default","string_open","""",false
 "string","char_seq","foo",false
 "string","escaped_char","\n",false
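Note: the README's debug step pipes `maleeni lex` output through `jq`. For readers who want to consume that token stream from Go instead, here is a minimal sketch. It is illustrative only and not part of this change: it decodes just the fields the examples above reference (`mode_name`, `kind_name`, `lexeme`, `eof`), and it assumes, as the `jq` pipelines suggest, one JSON object per line.

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
)

// token models only the fields used in the README examples above;
// the real `maleeni lex` output carries more fields than these.
type token struct {
	ModeName string `json:"mode_name"`
	KindName string `json:"kind_name"`
	Lexeme   string `json:"lexeme"`
	EOF      bool   `json:"eof"`
}

func main() {
	// Assumption: one JSON object per line, as the jq examples imply.
	sc := bufio.NewScanner(os.Stdin)
	for sc.Scan() {
		var t token
		if err := json.Unmarshal(sc.Bytes(), &t); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		fmt.Printf("%v: %v: %q\n", t.ModeName, t.KindName, t.Lexeme)
	}
}
```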
diff --git a/cmd/maleeni-go/generate.go b/cmd/maleeni-go/generate.go
index d37defd..d31daed 100644
--- a/cmd/maleeni-go/generate.go
+++ b/cmd/maleeni-go/generate.go
@@ -24,13 +24,14 @@ func Execute() error {
 
 var generateFlags = struct {
 	pkgName *string
+	output  *string
 }{}
 
 var generateCmd = &cobra.Command{
 	Use:           "maleeni-go",
 	Short:         "Generate a lexer for Go",
 	Long:          `maleeni-go generates a lexer for Go. The lexer recognizes the lexical specification specified as the argument.`,
-	Example:       `  maleeni-go clexspec.json > lexer.go`,
+	Example:       `  maleeni-go clexspec.json`,
 	Args:          cobra.ExactArgs(1),
 	RunE:          runGenerate,
 	SilenceErrors: true,
@@ -39,6 +40,7 @@ var generateCmd = &cobra.Command{
 
 func init() {
 	generateFlags.pkgName = generateCmd.Flags().StringP("package", "p", "main", "package name")
+	generateFlags.output = generateCmd.Flags().StringP("output", "o", "", "output file path")
 }
 
 func runGenerate(cmd *cobra.Command, args []string) (retErr error) {
@@ -47,7 +49,30 @@ func runGenerate(cmd *cobra.Command, args []string) (retErr error) {
 		return fmt.Errorf("Cannot read a compiled lexical specification: %w", err)
 	}
 
-	return driver.GenLexer(clspec, *generateFlags.pkgName)
+	b, err := driver.GenLexer(clspec, *generateFlags.pkgName)
+	if err != nil {
+		return fmt.Errorf("Failed to generate a lexer: %v", err)
+	}
+
+	var filePath string
+	if *generateFlags.output != "" {
+		filePath = *generateFlags.output
+	} else {
+		filePath = fmt.Sprintf("%v_lexer.go", clspec.Name)
+	}
+
+	f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
+	if err != nil {
+		return fmt.Errorf("Failed to create an output file: %v", err)
+	}
+	defer f.Close()
+
+	_, err = f.Write(b)
+	if err != nil {
+		return fmt.Errorf("Failed to write lexer source code: %v", err)
+	}
+
+	return nil
 }
 
 func readCompiledLexSpec(path string) (*spec.CompiledLexSpec, error) {
diff --git a/compiler/compiler.go b/compiler/compiler.go
index 5d1a1d5..0c89737 100644
--- a/compiler/compiler.go
+++ b/compiler/compiler.go
@@ -106,6 +106,7 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
 	}
 
 	return &spec.CompiledLexSpec{
+		Name:          lexspec.Name,
 		InitialModeID: spec.LexModeIDDefault,
 		ModeNames:     modeNames,
 		KindNames:     kindNames,
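Note: the new `runGenerate` body above resolves the output path in two steps: an explicit `-o`/`--output` value wins, otherwise the file is named `{spec name}_lexer.go` after the compiled spec. Distilled into a standalone helper for illustration only (`outputPath` is hypothetical; the change inlines this logic rather than defining a function):

```go
package main

import "fmt"

// outputPath restates the path-selection logic added to runGenerate.
// An explicit -o/--output value takes precedence; otherwise the lexer
// is written to {spec name}_lexer.go.
func outputPath(explicit, specName string) string {
	if explicit != "" {
		return explicit
	}
	return fmt.Sprintf("%v_lexer.go", specName)
}

func main() {
	fmt.Println(outputPath("", "statement"))          // statement_lexer.go
	fmt.Println(outputPath("custom.go", "statement")) // custom.go
}
```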
diff --git a/compiler/compiler_test.go b/compiler/compiler_test.go
index c76bb24..456920f 100644
--- a/compiler/compiler_test.go
+++ b/compiler/compiler_test.go
@@ -18,6 +18,7 @@ func TestCompile(t *testing.T) {
 			Caption: "allow duplicates names between fragments and non-fragments",
 			Spec: `
 {
+    "name": "test",
     "entries": [
         {
             "kind": "a2z",
@@ -36,6 +37,7 @@ func TestCompile(t *testing.T) {
 			Caption: "don't allow duplicates names in non-fragments",
 			Spec: `
 {
+    "name": "test",
     "entries": [
         {
             "kind": "a2z",
@@ -54,6 +56,7 @@ func TestCompile(t *testing.T) {
 			Caption: "don't allow duplicates names in fragments",
 			Spec: `
 {
+    "name": "test",
     "entries": [
         {
             "kind": "a2z",
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index a742bad..8af3817 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -103,6 +103,7 @@ func TestLexer_Next(t *testing.T) {
 	}{
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					newLexEntryDefaultNOP("t1", "(a|b)*abb"),
 					newLexEntryDefaultNOP("t2", " +"),
@@ -126,6 +127,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					newLexEntryDefaultNOP("t1", "b?a+"),
 					newLexEntryDefaultNOP("t2", "(ab)?(cd)+"),
@@ -154,6 +156,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					newLexEntryDefaultNOP("t1", "."),
 				},
@@ -198,6 +201,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"),
 				},
@@ -220,6 +224,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// all 1 byte characters except null character (U+0000)
 					//
@@ -246,6 +251,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// all 2 byte characters
 					newLexEntryDefaultNOP("char2Byte", "[\xc2\x80-\xdf\xbf]"),
@@ -267,6 +273,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// All bytes are the same.
 					newLexEntryDefaultNOP("char3Byte", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
@@ -282,6 +289,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// The first two bytes are the same.
 					newLexEntryDefaultNOP("char3Byte", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
@@ -303,6 +311,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// The first byte are the same.
 					newLexEntryDefaultNOP("char3Byte", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
@@ -324,6 +333,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// all 3 byte characters
 					newLexEntryDefaultNOP("char3Byte", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
@@ -369,6 +379,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// All bytes are the same.
 					newLexEntryDefaultNOP("char4Byte", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
@@ -384,6 +395,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// The first 3 bytes are the same.
 					newLexEntryDefaultNOP("char4Byte", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
@@ -405,6 +417,7 @@ func TestLexer_Next(t *testing.T) {
 		},
 		{
 			lspec: &spec.LexSpec{
+				Name: "test",
 				Entries: []*spec.LexEntry{
 					// The first 2 bytes are the same.
newLexEntryDefaultNOP("char4Byte", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), @@ -426,6 +439,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ // The first byte are the same. newLexEntryDefaultNOP("char4Byte", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), @@ -447,6 +461,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ // all 4 byte characters newLexEntryDefaultNOP("char4Byte", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), @@ -484,6 +499,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"), }, @@ -496,6 +512,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("char1Byte", "\\u{006E}"), newLexEntryDefaultNOP("char2Byte", "\\u{03BD}"), @@ -514,6 +531,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), }, @@ -529,6 +547,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("t1", "\\f{a2c}\\f{d2f}+"), newLexEntryFragment("a2c", "abc"), @@ -544,6 +563,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("t1", "(\\f{a2c}|\\f{d2f})+"), newLexEntryFragment("a2c", "abc"), @@ -558,6 +578,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("t1", "\\f{a2c_or_d2f}+"), newLexEntryFragment("a2c_or_d2f", "\\f{a2c}|\\f{d2f}"), @@ -573,6 +594,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("white_space", ` *`), newLexEntry([]string{"default"}, "string_open", `"`, "string", false), @@ -598,6 +620,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ // `white_space` is enabled in multiple modes. 
newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false), @@ -623,6 +646,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false), newLexEntry([]string{"default"}, "char", `.`, "", false), @@ -671,6 +695,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false), newLexEntry([]string{"default"}, "char", `.`, "", false), @@ -710,6 +735,7 @@ func TestLexer_Next(t *testing.T) { }, { lspec: &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("dot", spec.EscapePattern(`.`)), newLexEntryDefaultNOP("star", spec.EscapePattern(`*`)), @@ -778,6 +804,7 @@ func TestLexer_Next(t *testing.T) { func TestLexer_Next_WithPosition(t *testing.T) { lspec := &spec.LexSpec{ + Name: "test", Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("newline", `\u{000A}+`), newLexEntryDefaultNOP("any", `.`), diff --git a/driver/template.go b/driver/template.go index 2772135..d2772ae 100644 --- a/driver/template.go +++ b/driver/template.go @@ -1,13 +1,13 @@ package driver import ( + "bytes" _ "embed" "fmt" "go/ast" "go/format" "go/parser" "go/token" - "os" "strings" "text/template" @@ -17,19 +17,19 @@ import ( //go:embed lexer.go var lexerCoreSrc string -func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error { +func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) ([]byte, error) { var lexerSrc string { fset := token.NewFileSet() f, err := parser.ParseFile(fset, "lexer.go", lexerCoreSrc, parser.ParseComments) if err != nil { - return err + return nil, err } var b strings.Builder err = format.Node(&b, fset, f) if err != nil { - return err + return nil, err } lexerSrc = b.String() @@ -100,7 +100,7 @@ func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error { { t, err := template.New("").Funcs(genTemplateFuncs(clspec)).Parse(lexSpecTemplate) if err != nil { - return err + return nil, err } var b strings.Builder @@ -112,7 +112,7 @@ func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error { "compressionLevel": clspec.CompressionLevel, }) if err != nil { - return err + return nil, err } specSrc = b.String() @@ -136,7 +136,7 @@ func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error { t, err := template.New("").Parse(tmpl) if err != nil { - return err + return nil, err } var b strings.Builder @@ -149,7 +149,7 @@ func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error { "specSrc": specSrc, }) if err != nil { - return err + return nil, err } src = b.String() @@ -158,12 +158,18 @@ func GenLexer(clspec *spec.CompiledLexSpec, pkgName string) error { fset := token.NewFileSet() f, err := parser.ParseFile(fset, "", src, parser.ParseComments) if err != nil { - return err + return nil, err } f.Name = ast.NewIdent(pkgName) - return format.Node(os.Stdout, fset, f) + var b bytes.Buffer + err = format.Node(&b, fset, f) + if err != nil { + return nil, err + } + + return b.Bytes(), nil } const lexSpecTemplate = ` diff --git a/example/go.json b/example/go.json index 631313d..bf92717 100644 --- a/example/go.json +++ b/example/go.json @@ -1,4 +1,5 @@ { + "name": "go", "entries": [ { "kind": "line_comment_open", diff --git a/spec/spec.go b/spec/spec.go index 62acfc4..2360201 100644 --- a/spec/spec.go +++ b/spec/spec.go @@ -157,10 +157,16 @@ func (e *LexEntry) validate() 
diff --git a/spec/spec.go b/spec/spec.go
index 62acfc4..2360201 100644
--- a/spec/spec.go
+++ b/spec/spec.go
@@ -157,10 +157,16 @@ func (e *LexEntry) validate() error {
 }
 
 type LexSpec struct {
+	Name    string      `json:"name"`
 	Entries []*LexEntry `json:"entries"`
 }
 
 func (s *LexSpec) Validate() error {
+	err := validateIdentifier(s.Name)
+	if err != nil {
+		return fmt.Errorf("invalid specification name: %v", err)
+	}
+
 	if len(s.Entries) <= 0 {
 		return fmt.Errorf("the lexical specification must have at least one entry")
 	}
@@ -364,6 +370,7 @@ type CompiledLexModeSpec struct {
 }
 
 type CompiledLexSpec struct {
+	Name          string        `json:"name"`
 	InitialModeID LexModeID     `json:"initial_mode_id"`
 	ModeNames     []LexModeName `json:"mode_names"`
 	KindNames     []LexKindName `json:"kind_names"`
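Note: because `Validate` now checks the name before anything else, a pre-existing specification without a top-level `name` field fails validation before compilation, which is why `example/go.json` gains one above. The sketch below illustrates the expected behavior; it is not part of this change and assumes that `validateIdentifier` rejects an empty string, as the new error path implies. The JSON field names come from the spec format table in the README.

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func main() {
	// A pre-change spec: it has entries but no top-level "name" field.
	src := `{"entries": [{"kind": "word", "pattern": "[a-z]+"}]}`
	lspec := &spec.LexSpec{}
	if err := json.Unmarshal([]byte(src), lspec); err != nil {
		fmt.Println(err)
		return
	}
	// With this change, Validate should now fail with something like:
	//   invalid specification name: ...
	fmt.Println(lspec.Validate())
}
```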