lark-parser · t-higuchi · Oct 18, 2021 · Oct 18, 2021 · Oct 19, 2021 · Oct 19, 2021
diff --git a/examples/abnf/url_parser.py b/examples/abnf/url_parser.py
@@ -0,0 +1,132 @@
+"""
+Creating URL Parser from ABNF grammar in internet standards (RFC3986)
+=====================================================================
+
+Usage:
+ python3 -m examples.abnf.url_parser https://github.com/lark-parser/lark#readme
+ python3 -m examples.abnf.url_parser http://localhost:8000/search?q=lark%2dparser?user=me
+
+It outputs the parse tree for the URI passed as the first argument.
+
+"""
+import sys
+
+from lark import Lark, Transformer, v_args, Token, Visitor, Tree
+from lark.load_grammar import FromPackageLoader
+
+grammar_in_abnf ="""
+
+%import rfc3986             ; import from examples/grammars/rfc3986.abnf using custom loader
+%import core-rules          ; import from the standard library: ../lark/grammars/core-rules.abnf
+
+; Terminals need to be specified via %terminal directive to control
+; automatic parse-tree construction by lark.
+%terminal ALPHA, DIGIT
+%terminal HEXDIG
+%terminal unreserved
+"""
+
+
+class SimplifyABNFTree_Visitor(Visitor):
+    def __init__(self, unwrap_children=(), keep=(), *args, **kwargs):
+        super(SimplifyABNFTree_Visitor, self).__init__(*args, **kwargs)
+        self.unwrap = unwrap_children
+        self.keep   = keep
+
+    def visit(self, tree: Tree) -> Tree:
+        # override self.visit(), since _unwrap_and_flatten() assumes top-down visitor
+        self.visit_topdown(tree)
+
+    def _unwrap_and_flatten(self, tree, unwrap_recursive=False):
+        """ a generator to flatten tree into list or tuple """
+        do_unwrap = True if tree.data in self.unwrap or unwrap_recursive else False
+
+        for x in tree.children:
+            if isinstance(x, Tree) and do_unwrap:
+                if x.data in self.keep:
+                    yield self._concat_tokens(x, unwrap_recursive=True)
+                else:
+                    for item in list(self._unwrap_and_flatten(x, unwrap_recursive=True)):
+                        yield item
+            elif isinstance(x, Token):
+                yield x
+            else:
+                yield x
+
+
+    def _concat_tokens(self, tree, unwrap_recursive=False):
+        """ concatenate multiple tokens in tree.children into single token.
+            leave it as it is if there is a tree in tree.children.
+        """
+        items = [None]
+        words = []
+        children = list(self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive))
+
+        for x in children:
+            if isinstance(x, Token):
+                words.append(x.value)
+                if not isinstance(items[-1], Token):
+                    items.append(x)
+            else:
+                if len(words) > 1:
+                    items[-1] = items[-1].update(value=''.join(words))
+                items.append(x)
+                words=[]
+
+        if len(words) > 1:
+            items[-1] = items[-1].update(value=''.join(words))
+
+        tree.children = items[1:]
+        return tree;
+
+    def __default__(self, tree):
+        return self._concat_tokens(tree)
+
+
+class pct_encoded_conv(Transformer):
+    def pct_encoded(self, items): # alias for pct-encoded
+        # items = "%" HEXDIG HEXDIG
+
+        # extract hexadecimal digits, convert it to a character,
+        # then return modified token
+        char_in_hex = ''.join(items[1:])
+        char_ = bytearray.fromhex(char_in_hex).decode()
+        token = items[0].update(value=char_)
+        return token
+
+def main():
+    url = sys.argv[1]
+
+    custom_loader = FromPackageLoader('examples', ('grammars', ))
+    url_parser = Lark(grammar_in_abnf,
+                      # using ABNF grammar
+                      syntax='abnf',
+                      start='URI',
+                      # use earley parser since RFC3986 is too complex for LALR.
+                      parser='earley',
+                      # often needed to set keep_all_tokens=True when ABNF grammar is used.
+                      keep_all_tokens=True,
+                      import_paths=[custom_loader],
+    )
+    tree = url_parser.parse(url)
+
+    # Convert pct-encoded (e.g. '%2D' in given URL) to ascii characters
+    transformer=pct_encoded_conv()
+    tree = transformer.transform(tree)
+
+
+    # We need some post-processing to unwrap unwanted tree node and concatenate ABNF tokens
+    # to construct a token that we actually want since many ABNF grammar
+    # in RFCs split every input into too small units like a single character.
+
+    unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name',
+              'segment', 'query', 'fragment',
+              'path_abempty', 'path_absolute', 'path_noscheme', 'path_rootless')
+    simplifier = SimplifyABNFTree_Visitor(unwrap_children=unwrap)
+    simplifier.visit(tree)
+
+    print(tree.pretty())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/grammars/rfc3986.abnf b/examples/grammars/rfc3986.abnf
@@ -0,0 +1,87 @@
+; ABNF grammar from RFC3986
+;   Uniform Resource Identifier (URI): Generic Syntax
+;
+; Some terminals (e.g. DIGIT, ALPHA, ...) are defined in the ABNF core rules of RFC5234.
+;
+
+URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+
+hier-part     = "//" authority path-abempty
+                 / path-absolute
+                 / path-rootless
+                 / path-empty
+
+URI-reference = URI / relative-ref
+
+absolute-URI  = scheme ":" hier-part [ "?" query ]
+
+relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
+
+relative-part = "//" authority path-abempty
+                 / path-absolute
+                 / path-noscheme
+                 / path-empty
+
+scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+
+authority     = [ userinfo "@" ] host [ ":" port ]
+userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
+host          = IP-literal / IPv4address / reg-name
+port          = *DIGIT
+
+IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
+IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
+
+IPv6address   =                            6( h16 ":" ) ls32
+                 /                       "::" 5( h16 ":" ) ls32
+                 / [               h16 ] "::" 4( h16 ":" ) ls32
+                 / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+                 / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+                 / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+                 / [ *4( h16 ":" ) h16 ] "::"              ls32
+                 / [ *5( h16 ":" ) h16 ] "::"              h16
+                 / [ *6( h16 ":" ) h16 ] "::"
+
+h16           = 1*4HEXDIG
+ls32          = ( h16 ":" h16 ) / IPv4address
+IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
+
+dec-octet     = DIGIT                 ; 0-9
+                 / %x31-39 DIGIT         ; 10-99
+                 / "1" 2DIGIT            ; 100-199
+                 / "2" %x30-34 DIGIT     ; 200-249
+                 / "25" %x30-35          ; 250-255
+
+reg-name      = *( unreserved / pct-encoded / sub-delims )
+
+path          = path-abempty    ; begins with "/" or is empty
+                 / path-absolute   ; begins with "/" but not "//"
+                 / path-noscheme   ; begins with a non-colon segment
+                 / path-rootless   ; begins with a segment
+                 / path-empty      ; zero characters
+
+path-abempty  = *( "/" segment )
+path-absolute = "/" [ segment-nz *( "/" segment ) ]
+path-noscheme = segment-nz-nc *( "/" segment )
+path-rootless = segment-nz *( "/" segment )
+path-empty    = 0<pchar>
+
+
+segment       = *pchar
+segment-nz    = 1*pchar
+segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+                 ; non-zero-length segment without any colon ":"
+
+pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
+
+query         = *( pchar / "/" / "?" )
+fragment      = *( pchar / "/" / "?" )
+
+pct-encoded   = "%" HEXDIG HEXDIG
+
+unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+reserved      = gen-delims / sub-delims
+gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+                 / "*" / "+" / "," / ";" / "="
+
diff --git a/lark/grammars/abnf.lark b/lark/grammars/abnf.lark
@@ -0,0 +1,84 @@
+//
+// Lark's EBNF grammar to parse ABNF grammar (RFC5234)
+//
+
+
+_LPAR:        "("
+_RPAR:        ")"
+_LBRA:        "["
+_RBRA:        "]"
+_STAR:        "*"
+_SLASH:       "/"
+EQ:           "="
+EQ_ALT:       "=/"
+_IGNORE_CASE:    "%i"
+_CASE_SENSITIVE: "%s"
+
+RULE:   /[a-zA-Z][a-zA-Z0-9\-]*/
+
+QSTRING:    /"[ !#$%&\'\(\)\*\+,\-\.\/0-9:;<=>\?@A-Z\[\\\]\^_`a-z\{|\}~]*"/
+PROSE_VAL:  /<[ !"#$%&\'\(\)\*\+,\-\.\/0-9:;<=\?@A-Z\[\\\]\^_`a-z\{|\}~]*>/
+
+NUMBER:     /[0-9]+/
+DEC_VAL:    /%d([0-9]+(\.[0-9]+)+|[0-9]+\-[0-9]+|[0-9]+)/
+HEX_VAL:    /%x([0-9A-F]+(\.[0-9A-F]+)+|[0-9A-F]+\-[0-9A-F]+|[0-9A-F]+)/
+BIN_VAL:    /%b([01]+(\.[01]+)+|[01]+\-[01]+|[01]+)/
+
+_C_NL:    /(;[^\n]*)*\r?\n/
+_C_WSP:   /((;[^\n]*)*\r?\n)?[ \t]+/
+
+// terminals for nonstandard extensions
+_IMPORT:    "%import"
+_DOT:       "."
+_COMMA:     ","
+
+
+start:            _rulelist
+_rulelist:        (rule | abnf_import | terminal_def | (_C_WSP* _C_NL))+ 
+
+rule:              RULE _defined_as _elements _C_NL
+
+_defined_as:       _C_WSP* (EQ|EQ_ALT) _C_WSP*
+_elements:         alternation _C_WSP*
+alternation:       concatenation (_C_WSP* _SLASH _C_WSP* concatenation)*
+concatenation:     repetition (_C_WSP+ repetition)*
+repetition:        repeat? _element
+
+//  repeat =  1*DIGIT / (*DIGIT "*" *DIGIT)
+repeat:            (repeat_min _STAR repeat_max)|(repeat_min _STAR)|(_STAR repeat_max)|_STAR|repeat_n
+repeat_n:          NUMBER
+repeat_min:        NUMBER
+repeat_max:        NUMBER
+
+_element:          rule_ref|_group|option|char_val|num_val|prose_val
+rule_ref:          RULE
+// 'group' is inlined intentionally.
+_group:            _LPAR _C_WSP* alternation _C_WSP* _RPAR
+option:            _LBRA _C_WSP* alternation _C_WSP* _RBRA
+
+char_val:          case_insensitive_string|case_sensitive_string
+case_insensitive_string:          _IGNORE_CASE? QSTRING
+case_sensitive_string:            _CASE_SENSITIVE QSTRING
+
+num_val:           dec_val|bin_val|hex_val
+dec_val:           DEC_VAL
+hex_val:           HEX_VAL
+bin_val:           BIN_VAL
+
+prose_val:         PROSE_VAL
+
+// nonstandard extensions to ABNF grammar
+// (%import)
+abnf_import:       _import1
+_import1:          _IMPORT _C_WSP+ _import_path _C_WSP* name_list? _C_WSP* _C_NL
+_import_path:      import_from_lib|import_relpath
+import_from_lib:   _import_args
+import_relpath:    _DOT _import_args
+_import_args:      PATHNAME (_DOT PATHNAME)*
+name_list:         _LPAR _C_WSP* RULE (_C_WSP* _COMMA _C_WSP* RULE)* _C_WSP* _RPAR
+
+PATHNAME:    /[!#$%&\'\+,\-0-9;=@A-Z\[\]\^_a-z`\{\}~]+/
+
+// (%terminal)
+terminal_def:   _TERMINAL _C_WSP+ RULE (_C_WSP* _COMMA _C_WSP* RULE)*
+_TERMINAL:      "%terminal"
diff --git a/lark/grammars/core-rules.abnf b/lark/grammars/core-rules.abnf
@@ -0,0 +1,39 @@
+; ABNF Core Rules (RFC5234 Appendix.B)
+
+ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
+BIT            =  "0" / "1"
+CHAR           =  %x01-7F
+                        ; any 7-bit US-ASCII character,
+                        ;  excluding NUL
+CR             =  %x0D
+                        ; carriage return
+CRLF           =  CR LF
+                        ; Internet standard newline
+CTL            =  %x00-1F / %x7F
+                        ; controls
+DIGIT          =  %x30-39
+                        ; 0-9
+DQUOTE         =  %x22
+                        ; " (Double Quote)
+HEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
+HTAB           =  %x09
+                        ; horizontal tab
+LF             =  %x0A
+                        ; linefeed
+LWSP           =  *(WSP / CRLF WSP)
+                        ; Use of this linear-white-space rule
+                        ;  permits lines containing only white
+                        ;  space that are no longer legal in
+                        ;  mail headers and have caused
+                        ;  interoperability problems in other
+                        ;  contexts.
+                        ; Do not use when defining mail
+                        ;  headers and use with caution in
+                        ;  other contexts.
+OCTET          =  %x00-FF
+                        ; 8 bits of data
+SP             =  %x20
+VCHAR          =  %x21-7E
+                        ; visible (printing) characters
+WSP            =  SP / HTAB
+                        ; white space
diff --git a/lark/lark.py b/lark/lark.py
@@ -98,6 +98,11 @@ class LarkOptions(Serialize):
             Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``)
     tree_class
             Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.
+    syntax
+            Syntax for grammar specification.
+
+            - "lark" (default): Lark's EBNF based syntax
+            - "abnf" : ABNF syntax, described in RFC5234. Various extensions in Lark's EBNF syntax are not supported.
 
     **=== Algorithm Options ===**
 
@@ -169,6 +174,7 @@ class LarkOptions(Serialize):
         'use_bytes': False,
         'import_paths': [],
         'source_path': None,
+        'syntax': 'lark',
     }
 
     def __init__(self, options_dict):
@@ -328,7 +334,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
 
 
             # Parse the grammar file and compose the grammars
-            self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
+            self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens, self.options.syntax)
         else:
             assert isinstance(grammar, Grammar)
             self.grammar = grammar