Merge pull request #1261 from lark-parser/strict_mode
Added strict-mode, enabled using the strict=True flag
erezsh authored Mar 9, 2023
2 parents 0551acf + d97eaa3 commit a9d353e
Showing 9 changed files with 54 additions and 15 deletions.
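In short, strict=True turns the lexer's terminal-collision warning and the LALR shift/reduce warning into hard errors at grammar-load time. A minimal usage sketch, not part of the diff itself; it assumes the optional interegular package is installed (strict mode requires it for the collision check) and uses a trivial, conflict-free grammar:

from lark import Lark

# parser='lalr': per the new test, strict mode is only supported for LALR for now.
# With a clean grammar, construction succeeds exactly as before.
parser = Lark(r'''
start: "hello" "world"
''', parser='lalr', strict=True)

print(parser.parse("helloworld"))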
5 changes: 4 additions & 1 deletion lark/common.py
@@ -38,8 +38,10 @@ class LexerConf(Serialize):
skip_validation: bool
use_bytes: bool
lexer_type: Optional[_LexerArgType]
strict: bool

def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False):
def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None,
callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False, strict: bool=False):
self.terminals = terminals
self.terminals_by_name = {t.name: t for t in self.terminals}
assert len(self.terminals) == len(self.terminals_by_name)
@@ -50,6 +52,7 @@ def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ig
self.re_module = re_module
self.skip_validation = skip_validation
self.use_bytes = use_bytes
self.strict = strict
self.lexer_type = None

def _deserialize(self):
6 changes: 5 additions & 1 deletion lark/lark.py
Expand Up @@ -54,6 +54,7 @@ class LarkOptions(Serialize):

start: List[str]
debug: bool
strict: bool
transformer: 'Optional[Transformer]'
propagate_positions: Union[bool, str]
maybe_placeholders: bool
@@ -81,6 +82,8 @@ class LarkOptions(Serialize):
debug
Display debug information and extra warnings. Use only when debugging (Default: ``False``)
When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed.
strict
Throw an exception on any potential ambiguity, including shift/reduce conflicts, and regex collisions.
transformer
Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
propagate_positions
@@ -156,6 +159,7 @@ class LarkOptions(Serialize):
# - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument
_defaults: Dict[str, Any] = {
'debug': False,
'strict': False,
'keep_all_tokens': False,
'tree_class': None,
'cache': False,
@@ -424,7 +428,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
# TODO Deprecate lexer_callbacks?
self.lexer_conf = LexerConf(
self.terminals, re_module, self.ignore_tokens, self.options.postlex,
self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes
self.options.lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes, strict=self.options.strict
)

if self.options.parser:
11 changes: 7 additions & 4 deletions lark/lexer.py
@@ -499,11 +499,14 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
# Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
comparator.mark(a, b)

# leave it as a warning for the moment
# raise LexError("Collision between Terminals %s and %s" % (a.name, b.name))
# Notify the user
message = f"Collision between Terminals {a.name} and {b.name}. "
example = comparator.get_example_overlap(a, b).format_multiline()
logger.warning(f"Collision between Terminals {a.name} and {b.name}. "
f"The lexer will choose between them arbitrarily.\n" + example)
if conf.strict:
raise LexError(f"{message}\n{example}")
logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
elif conf.strict:
raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")

# Init
self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
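A rough sketch of how this branch surfaces to users, reusing the colliding terminals from the new test in tests/test_parser.py (assumes interegular is installed; without it, strict mode raises the LexError about installing it instead):

from lark import Lark
from lark.exceptions import LexError

# /e?rez/ and /erez?/ both match "erez", so interegular reports a collision.
grammar = r'''
start: A | B
A: /e?rez/
B: /erez?/
'''

Lark(grammar, parser='lalr')                   # still builds; logs the collision warning
try:
    Lark(grammar, parser='lalr', strict=True)  # same grammar, now a hard error
except LexError as err:
    print(err)                                 # message plus an example overlap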
3 changes: 2 additions & 1 deletion lark/parser_frontends.py
@@ -145,8 +145,9 @@ def create_contextual_lexer(lexer_conf, parser, postlex, options):

def create_lalr_parser(lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
strict = options.strict if options else False
cls = (options and options._plugins.get('LALR_Parser')) or LALR_Parser
return cls(parser_conf, debug=debug)
return cls(parser_conf, debug=debug, strict=strict)

_parser_creators['lalr'] = create_lalr_parser

3 changes: 2 additions & 1 deletion lark/parsers/grammar_analysis.py
@@ -122,8 +122,9 @@ def calculate_sets(rules):


class GrammarAnalyzer:
def __init__(self, parser_conf, debug=False):
def __init__(self, parser_conf, debug=False, strict=False):
self.debug = debug
self.strict = strict

root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
for start in parser_conf.start}
16 changes: 12 additions & 4 deletions lark/parsers/lalr_analysis.py
@@ -131,8 +131,8 @@ def traverse(x, S, N, X, R, G, F):


class LALR_Analyzer(GrammarAnalyzer):
def __init__(self, parser_conf, debug=False):
GrammarAnalyzer.__init__(self, parser_conf, debug)
def __init__(self, parser_conf, debug=False, strict=False):
GrammarAnalyzer.__init__(self, parser_conf, debug, strict)
self.nonterminal_transitions = []
self.directly_reads = defaultdict(set)
self.reads = defaultdict(set)
@@ -256,10 +256,18 @@ def compute_lalr1_states(self):
rules = [best[1]]
else:
reduce_reduce.append((state, la, rules))
continue

rule ,= rules
if la in actions:
if self.debug:
if self.strict:
raise GrammarError(f"Shift/Reduce conflict for terminal {la.name}. [strict-mode]\n ")
elif self.debug:
logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.warning(' * %s', list(rules)[0])
logger.warning(' * %s', rule)
else:
logger.debug('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
logger.debug(' * %s', rule)
else:
actions[la] = (Reduce, list(rules)[0])
m[state] = { k.name: v for k, v in actions.items() }
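A sketch of the new strict branch for table conflicts, using the shift/reduce grammar from the new test: the trailing "." of start competes with the "."+ inside a, which previously only produced a warning when debug=True:

from lark import Lark
from lark.exceptions import GrammarError

grammar = r'''
start: a "."
a: "."+
'''

try:
    Lark(grammar, parser='lalr', strict=True)
except GrammarError as err:
    print(err)   # Shift/Reduce conflict for terminal ... [strict-mode]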
4 changes: 2 additions & 2 deletions lark/parsers/lalr_parser.py
@@ -14,8 +14,8 @@
###{standalone

class LALR_Parser(Serialize):
def __init__(self, parser_conf, debug=False):
analysis = LALR_Analyzer(parser_conf, debug=debug)
def __init__(self, parser_conf, debug=False, strict=False):
analysis = LALR_Analyzer(parser_conf, debug=debug, strict=strict)
analysis.compute_lalr()
callbacks = parser_conf.callbacks

2 changes: 1 addition & 1 deletion tests/test_logger.py
@@ -41,7 +41,7 @@ def test_debug(self):
self.assertIn("A", log)

def test_non_debug(self):
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.WARNING)
collision_grammar = '''
start: as as
as: a*
19 changes: 19 additions & 0 deletions tests/test_parser.py
@@ -2625,6 +2625,25 @@ def __default__(self, data, children, meta):
b = parser.parse(s)
assert a == b

@unittest.skipIf(PARSER!='lalr', "strict mode is only supported in lalr for now")
def test_strict(self):
# Test regex collision
grammar = r"""
start: A | B
A: /e?rez/
B: /erez?/
"""

self.assertRaises(LexError, _Lark, grammar, strict=True)

# Test shift-reduce collision
grammar = r"""
start: a "."
a: "."+
"""
self.assertRaises(GrammarError, _Lark, grammar, strict=True)


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
