Merge pull request #1260 from lark-parser/MegaIng-interegular-two

Rephrased docs for interegular PR (#1258)
lark-parser · Mar 8, 2023 · 1daba2c · 1daba2c
2 parents 2564232 + ac36ed0
commit 1daba2c
Show file tree

Hide file tree

Showing 12 changed files with 159 additions and 39 deletions.
diff --git a/docs/features.md b/docs/features.md
@@ -11,6 +11,7 @@
  - Flexible error handling by using an interactive parser interface (LALR only)
  - Automatic line & column tracking (for both tokens and matched rules)
  - Automatic terminal collision resolution
+ - Warns on regex collisions using the optional `interegular` library. ([read more](how_to_use.html#regex-collisions))
  - Grammar composition - Import terminals and rules from other grammars
  - Standard library of terminals (strings, numbers, names, etc.)
  - Unicode fully supported

diff --git a/docs/how_to_use.md b/docs/how_to_use.md
@@ -42,6 +42,38 @@ Usually, by the time you get to a minimal grammar, the problem becomes clear.
 
 But if it doesn't, feel free to ask us on gitter, or even open an issue. Post a reproducing code, with the minimal grammar and input, and we'll do our best to help.
 
+### Regex collisions
+
+A likely source of bugs is two regexes in a grammar that can match the same input. If both terminals have the same priority, most lexers will arbitrarily choose the first one that matches, which isn't always the desired one. (A notable exception is the `dynamic_complete` lexer, which always tries all variations, but its users pay for that with performance.)
+
+These collisions can be hard to notice, and their effects can be difficult to debug, as they are subtle and sometimes hard to reproduce.
+
+To help with these situations, Lark can utilize a new external library called `interegular`. If it is installed, Lark uses it to check for collisions and warns about any conflicts that it finds:
+
+```
+import logging
+from lark import Lark, logger
+
+logger.setLevel(logging.WARN)
+
+collision_grammar = '''
+start: A | B
+A: /a+/
+B: /[ab]+/
+'''
+p = Lark(collision_grammar, parser='lalr')
+
+# Output:
+# Collision between Terminals B and A. The lexer will choose between them arbitrarily.
+# Example Collision: a
+```
+
+You can install interegular for Lark using `pip install 'lark[interegular]'`.
+
+Note 1: Interegular currently only runs when the lexer is `basic` or `contextual`.
+
+Note 2: Some advanced regex features, such as lookahead and lookbehind, may prevent interegular from detecting existing collisions.
+
 ### LALR
 
 By default Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings pass `debug=True`. To get the messages printed you have to configure the `logger` beforehand. For example:

diff --git a/examples/advanced/python_parser.py b/examples/advanced/python_parser.py
@@ -11,11 +11,14 @@
 from io import open
 import glob, time
 
-from lark import Lark
+from lark import Lark, logger
 from lark.indenter import PythonIndenter
+import logging
 
+logger.setLevel(logging.DEBUG)
 
-kwargs = dict(postlex=PythonIndenter(), start='file_input')
+
+kwargs = dict(postlex=PythonIndenter(), start='file_input', cache=False)
 
 # Official Python grammar by Lark
 python_parser3 = Lark.open_from_package('lark', 'python.lark', ['grammars'], parser='lalr', **kwargs)

diff --git a/lark/lexer.py b/lark/lexer.py
@@ -9,16 +9,25 @@
 )
 from types import ModuleType
 import warnings
+try:
+ import interegular
+except ImportError:
+ pass
 if TYPE_CHECKING:
  from .common import LexerConf
 
-from .utils import classify, get_regexp_width, Serialize
+from .utils import classify, get_regexp_width, Serialize, logger
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 from .grammar import TOKEN_DEFAULT_PRIORITY
 
+
 ###{standalone
 from copy import copy
 
+try: # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on
+ has_interegular = bool(interegular)
+except NameError:
+ has_interegular = False
 
 class Pattern(Serialize, ABC):
 
@@ -27,7 +36,7 @@ class Pattern(Serialize, ABC):
  raw: Optional[str]
  type: ClassVar[str]
 
- def __init__(self, value: str, flags: Collection[str]=(), raw: Optional[str]=None) -> None:
+ def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
  self.value = value
  self.flags = frozenset(flags)
  self.raw = raw
@@ -110,7 +119,7 @@ class TerminalDef(Serialize):
  pattern: Pattern
  priority: int
 
- def __init__(self, name: str, pattern: Pattern, priority: int=TOKEN_DEFAULT_PRIORITY) -> None:
+ def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
  assert isinstance(pattern, Pattern), pattern
  self.name = name
  self.pattern = pattern
@@ -120,7 +129,7 @@ def __repr__(self):
  return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
 
  def user_repr(self) -> str:
- if self.name.startswith('__'): # We represent a generated terminal
+ if self.name.startswith('__'):  # We represent a generated terminal
  return self.pattern.raw or self.name
  else:
  return self.name
@@ -162,29 +171,29 @@ class Token(str):
 
  @overload
  def __new__(
- cls,
- type: str,
- value: Any,
- start_pos: Optional[int]=None,
- line: Optional[int]=None,
- column: Optional[int]=None,
- end_line: Optional[int]=None,
- end_column: Optional[int]=None,
- end_pos: Optional[int]=None
+  cls,
+  type: str,
+  value: Any,
+  start_pos: Optional[int] = None,
+  line: Optional[int] = None,
+  column: Optional[int] = None,
+  end_line: Optional[int] = None,
+  end_column: Optional[int] = None,
+  end_pos: Optional[int] = None
  ) -> 'Token':
  ...
 
  @overload
  def __new__(
- cls,
- type_: str,
- value: Any,
- start_pos: Optional[int]=None,
- line: Optional[int]=None,
- column: Optional[int]=None,
- end_line: Optional[int]=None,
- end_column: Optional[int]=None,
- end_pos: Optional[int]=None
+  cls,
+  type_: str,
+  value: Any,
+  start_pos: Optional[int] = None,
+  line: Optional[int] = None,
+  column: Optional[int] = None,
+  end_line: Optional[int] = None,
+  end_column: Optional[int] = None,
+  end_pos: Optional[int] = None
  ) -> 'Token': ...
 
  def __new__(cls, *args, **kwargs):
@@ -213,11 +222,11 @@ def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_li
  return inst
 
  @overload
- def update(self, type: Optional[str]=None, value: Optional[Any]=None) -> 'Token':
+ def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
  ...
 
  @overload
- def update(self, type_: Optional[str]=None, value: Optional[Any]=None) -> 'Token':
+ def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
  ...
 
  def update(self, *args, **kwargs):
@@ -230,7 +239,7 @@ def update(self, *args, **kwargs):
 
  return self._future_update(*args, **kwargs)
 
- def _future_update(self, type: Optional[str]=None, value: Optional[Any]=None) -> 'Token':
+ def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
  return Token.new_borrow_pos(
  type if type is not None else self.type,
  value if value is not None else self.value,
@@ -364,7 +373,7 @@ def _build_mres(self, terminals, max_size):
  try:
  mre = self.re_.compile(pattern, self.g_regex_flags)
  except AssertionError: # Yes, this is what Python provides us.. :/
- return self._build_mres(terminals, max_size//2)
+ return self._build_mres(terminals, max_size // 2)
 
  mres.append(mre)
  terminals = terminals[max_size:]
@@ -457,26 +466,45 @@ class BasicLexer(Lexer):
  callback: Dict[str, _Callback]
  re: ModuleType
 
- def __init__(self, conf: 'LexerConf') -> None:
+ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
  terminals = list(conf.terminals)
  assert all(isinstance(t, TerminalDef) for t in terminals), terminals
 
  self.re = conf.re_module
 
  if not conf.skip_validation:
  # Sanitization
+ terminal_to_regexp = {}
  for t in terminals:
+ regexp = t.pattern.to_regexp()
  try:
- self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
+ self.re.compile(regexp, conf.g_regex_flags)
  except self.re.error:
  raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
 
  if t.pattern.min_width == 0:
  raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
+ if t.pattern.type == "re":
+ terminal_to_regexp[t] = regexp
 
  if not (set(conf.ignore) <= {t.name for t in terminals}):
  raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))
 
+ if has_interegular:
+ if not comparator:
+ comparator = interegular.Comparator.from_regexes(terminal_to_regexp)
+ for group in classify(terminal_to_regexp, lambda t: t.priority).values():
+ for a, b in comparator.check(group, skip_marked=True):
+ assert a.priority == b.priority
+ # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
+ comparator.mark(a, b)
+
+ # leave it as a warning for the moment
+ # raise LexError("Collision between Terminals %s and %s" % (a.name, b.name))
+ example = comparator.get_example_overlap(a, b).format_multiline()
+ logger.warning(f"Collision between Terminals {a.name} and {b.name}. "
+ f"The lexer will choose between them arbitrarily.\n" + example)
+
  # Init
  self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
  self.ignore_types = frozenset(conf.ignore)
@@ -517,7 +545,7 @@ def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
  while True:
  yield self.next_token(state, parser_state)
 
- def next_token(self, lex_state: LexerState, parser_state: Any=None) -> Token:
+ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
  line_ctr = lex_state.line_ctr
  while line_ctr.char_pos < len(lex_state.text):
  res = self.match(lex_state.text, line_ctr.char_pos)
@@ -565,6 +593,10 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
  trad_conf = copy(conf)
  trad_conf.terminals = terminals
 
+ if has_interegular and not conf.skip_validation:
+ comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
+ else:
+ comparator = None
  lexer_by_tokens: Dict[FrozenSet[str], BasicLexer] = {}
  self.lexers = {}
  for state, accepts in states.items():
@@ -575,13 +607,14 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
  accepts = set(accepts) | set(conf.ignore) | set(always_accept)
  lexer_conf = copy(trad_conf)
  lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
- lexer = BasicLexer(lexer_conf)
+ lexer = BasicLexer(lexer_conf, comparator)
  lexer_by_tokens[key] = lexer
 
  self.lexers[state] = lexer
 
  assert trad_conf.terminals is terminals
- self.root_lexer = BasicLexer(trad_conf)
+ trad_conf.skip_validation = True # We don't need to verify all terminals again
+ self.root_lexer = BasicLexer(trad_conf, comparator)
 
  def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
  try:

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
@@ -79,7 +79,7 @@
  '_RBRA': r'\]',
  '_LBRACE': r'\{',
  '_RBRACE': r'\}',
- 'OP': '[+*]|[?](?![a-z])',
+ 'OP': '[+*]|[?](?![a-z_])',
  '_COLON': ':',
  '_COMMA': ',',
  '_OR': r'\|',

diff --git a/lark/tools/__init__.py b/lark/tools/__init__.py
@@ -6,6 +6,11 @@
 import warnings
 
 from lark import Lark, logger
+try:
+ from interegular import logger as interegular_logger
+ has_interegular = True
+except ImportError:
+ has_interegular = False
 
 lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options')
 
@@ -40,6 +45,8 @@
 
 def build_lalr(namespace):
  logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)])
+ if has_interegular:
+ interegular_logger.setLevel(logger.getEffectiveLevel())
  if len(namespace.start) == 0:
  namespace.start.append('start')
  kwargs = {n: getattr(namespace, n) for n in options}

diff --git a/lark/utils.py b/lark/utils.py
@@ -3,7 +3,7 @@
 from functools import reduce
 from itertools import product
 from collections import deque
-from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence
+from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable
 
 ###{standalone
 import sys, re
@@ -21,7 +21,7 @@
 T = TypeVar("T")
 
 
-def classify(seq: Sequence, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict:
+def classify(seq: Iterable, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict:
  d: Dict[Any, Any] = {}
  for item in seq:
  k = key(item) if (key is not None) else item

diff --git a/setup.py b/setup.py
@@ -15,6 +15,7 @@
  "regex": ["regex"],
  "nearley": ["js2py"],
  "atomic_cache": ["atomicwrites"],
+ "interegular": ["interegular>=0.2.4"],
  },
 
  package_data = {'': ['*.md', '*.lark'], 'lark': ['py.typed']},

diff --git a/test-requirements.txt b/test-requirements.txt
@@ -1,2 +1,3 @@
+interegular>=0.2.4
 Js2Py==0.68
 regex