Skip to content

Commit

Permalink
Merge pull request #1260 from lark-parser/MegaIng-interegular-two
Browse files Browse the repository at this point in the history
Rephrased docs for interegular PR (#1258)
  • Loading branch information
erezsh committed Mar 8, 2023
2 parents 2564232 + ac36ed0 commit 1daba2c
Show file tree
Hide file tree
Showing 12 changed files with 159 additions and 39 deletions.
1 change: 1 addition & 0 deletions docs/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- Flexible error handling by using an interactive parser interface (LALR only)
- Automatic line & column tracking (for both tokens and matched rules)
- Automatic terminal collision resolution
- Warns on regex collisions using the optional `interegular` library. ([read more](how_to_use.html#regex-collisions))
- Grammar composition - Import terminals and rules from other grammars
- Standard library of terminals (strings, numbers, names, etc.)
- Unicode fully supported
Expand Down
32 changes: 32 additions & 0 deletions docs/how_to_use.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,38 @@ Usually, by the time you get to a minimal grammar, the problem becomes clear.

But if it doesn't, feel free to ask us on gitter, or even open an issue. Post reproducing code, with the minimal grammar and input, and we'll do our best to help.

### Regex collisions

A likely source of bugs is when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers will arbitrarily choose the first one that matches, which isn't always the desired one. (A notable exception is the `dynamic_complete` lexer, which always tries all variations, but its users pay for that with performance.)

These collisions can be hard to notice, and their effects can be difficult to debug, as they are subtle and sometimes hard to reproduce.

To help with these situations, Lark can utilize a new external library called `interegular`. If it is installed, Lark uses it to check for collisions, and warns about any conflicts that it can find:

```
import logging
from lark import Lark, logger
logger.setLevel(logging.WARN)
collision_grammar = '''
start: A | B
A: /a+/
B: /[ab]+/
'''
p = Lark(collision_grammar, parser='lalr')
# Output:
# Collision between Terminals B and A. The lexer will choose between them arbitrarily.
# Example Collision: a
```

You can install interegular for Lark using `pip install 'lark[interegular]'`.

Note 1: Interegular currently only runs when the lexer is `basic` or `contextual`.

Note 2: Some advanced regex features, such as lookahead and lookbehind, may prevent interegular from detecting existing collisions.

### LALR

By default, Lark silently resolves Shift/Reduce conflicts as Shift. To enable warnings, pass `debug=True`. To get the messages printed, you have to configure the `logger` beforehand. For example:
Expand Down
7 changes: 5 additions & 2 deletions examples/advanced/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,14 @@
from io import open
import glob, time

from lark import Lark
from lark import Lark, logger
from lark.indenter import PythonIndenter
import logging

logger.setLevel(logging.DEBUG)

kwargs = dict(postlex=PythonIndenter(), start='file_input')

kwargs = dict(postlex=PythonIndenter(), start='file_input', cache=False)

# Official Python grammar by Lark
python_parser3 = Lark.open_from_package('lark', 'python.lark', ['grammars'], parser='lalr', **kwargs)
Expand Down
95 changes: 64 additions & 31 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,25 @@
)
from types import ModuleType
import warnings
try:
import interegular
except ImportError:
pass
if TYPE_CHECKING:
from .common import LexerConf

from .utils import classify, get_regexp_width, Serialize
from .utils import classify, get_regexp_width, Serialize, logger
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
from .grammar import TOKEN_DEFAULT_PRIORITY


###{standalone
from copy import copy

try: # For the standalone parser, we need to make sure that has_interegular is False to avoid NameErrors later on
has_interegular = bool(interegular)
except NameError:
has_interegular = False

class Pattern(Serialize, ABC):

Expand All @@ -27,7 +36,7 @@ class Pattern(Serialize, ABC):
raw: Optional[str]
type: ClassVar[str]

def __init__(self, value: str, flags: Collection[str]=(), raw: Optional[str]=None) -> None:
def __init__(self, value: str, flags: Collection[str] = (), raw: Optional[str] = None) -> None:
self.value = value
self.flags = frozenset(flags)
self.raw = raw
Expand Down Expand Up @@ -110,7 +119,7 @@ class TerminalDef(Serialize):
pattern: Pattern
priority: int

def __init__(self, name: str, pattern: Pattern, priority: int=TOKEN_DEFAULT_PRIORITY) -> None:
def __init__(self, name: str, pattern: Pattern, priority: int = TOKEN_DEFAULT_PRIORITY) -> None:
assert isinstance(pattern, Pattern), pattern
self.name = name
self.pattern = pattern
Expand All @@ -120,7 +129,7 @@ def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

def user_repr(self) -> str:
if self.name.startswith('__'): # We represent a generated terminal
if self.name.startswith('__'): # We represent a generated terminal
return self.pattern.raw or self.name
else:
return self.name
Expand Down Expand Up @@ -162,29 +171,29 @@ class Token(str):

@overload
def __new__(
cls,
type: str,
value: Any,
start_pos: Optional[int]=None,
line: Optional[int]=None,
column: Optional[int]=None,
end_line: Optional[int]=None,
end_column: Optional[int]=None,
end_pos: Optional[int]=None
cls,
type: str,
value: Any,
start_pos: Optional[int] = None,
line: Optional[int] = None,
column: Optional[int] = None,
end_line: Optional[int] = None,
end_column: Optional[int] = None,
end_pos: Optional[int] = None
) -> 'Token':
...

@overload
def __new__(
cls,
type_: str,
value: Any,
start_pos: Optional[int]=None,
line: Optional[int]=None,
column: Optional[int]=None,
end_line: Optional[int]=None,
end_column: Optional[int]=None,
end_pos: Optional[int]=None
cls,
type_: str,
value: Any,
start_pos: Optional[int] = None,
line: Optional[int] = None,
column: Optional[int] = None,
end_line: Optional[int] = None,
end_column: Optional[int] = None,
end_pos: Optional[int] = None
) -> 'Token': ...

def __new__(cls, *args, **kwargs):
Expand Down Expand Up @@ -213,11 +222,11 @@ def _future_new(cls, type, value, start_pos=None, line=None, column=None, end_li
return inst

@overload
def update(self, type: Optional[str]=None, value: Optional[Any]=None) -> 'Token':
def update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
...

@overload
def update(self, type_: Optional[str]=None, value: Optional[Any]=None) -> 'Token':
def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
...

def update(self, *args, **kwargs):
Expand All @@ -230,7 +239,7 @@ def update(self, *args, **kwargs):

return self._future_update(*args, **kwargs)

def _future_update(self, type: Optional[str]=None, value: Optional[Any]=None) -> 'Token':
def _future_update(self, type: Optional[str] = None, value: Optional[Any] = None) -> 'Token':
return Token.new_borrow_pos(
type if type is not None else self.type,
value if value is not None else self.value,
Expand Down Expand Up @@ -364,7 +373,7 @@ def _build_mres(self, terminals, max_size):
try:
mre = self.re_.compile(pattern, self.g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return self._build_mres(terminals, max_size//2)
return self._build_mres(terminals, max_size // 2)

mres.append(mre)
terminals = terminals[max_size:]
Expand Down Expand Up @@ -457,26 +466,45 @@ class BasicLexer(Lexer):
callback: Dict[str, _Callback]
re: ModuleType

def __init__(self, conf: 'LexerConf') -> None:
def __init__(self, conf: 'LexerConf', comparator=None) -> None:
terminals = list(conf.terminals)
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

self.re = conf.re_module

if not conf.skip_validation:
# Sanitization
terminal_to_regexp = {}
for t in terminals:
regexp = t.pattern.to_regexp()
try:
self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
self.re.compile(regexp, conf.g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
if t.pattern.type == "re":
terminal_to_regexp[t] = regexp

if not (set(conf.ignore) <= {t.name for t in terminals}):
raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))

if has_interegular:
if not comparator:
comparator = interegular.Comparator.from_regexes(terminal_to_regexp)
for group in classify(terminal_to_regexp, lambda t: t.priority).values():
for a, b in comparator.check(group, skip_marked=True):
assert a.priority == b.priority
# Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
comparator.mark(a, b)

# leave it as a warning for the moment
# raise LexError("Collision between Terminals %s and %s" % (a.name, b.name))
example = comparator.get_example_overlap(a, b).format_multiline()
logger.warning(f"Collision between Terminals {a.name} and {b.name}. "
f"The lexer will choose between them arbitrarily.\n" + example)

# Init
self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
self.ignore_types = frozenset(conf.ignore)
Expand Down Expand Up @@ -517,7 +545,7 @@ def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
while True:
yield self.next_token(state, parser_state)

def next_token(self, lex_state: LexerState, parser_state: Any=None) -> Token:
def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
Expand Down Expand Up @@ -565,6 +593,10 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
trad_conf = copy(conf)
trad_conf.terminals = terminals

if has_interegular and not conf.skip_validation:
comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
else:
comparator = None
lexer_by_tokens: Dict[FrozenSet[str], BasicLexer] = {}
self.lexers = {}
for state, accepts in states.items():
Expand All @@ -575,13 +607,14 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
lexer_conf = copy(trad_conf)
lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
lexer = BasicLexer(lexer_conf)
lexer = BasicLexer(lexer_conf, comparator)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

assert trad_conf.terminals is terminals
self.root_lexer = BasicLexer(trad_conf)
trad_conf.skip_validation = True # We don't need to verify all terminals again
self.root_lexer = BasicLexer(trad_conf, comparator)

def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
try:
Expand Down
2 changes: 1 addition & 1 deletion lark/load_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
'_RBRA': r'\]',
'_LBRACE': r'\{',
'_RBRACE': r'\}',
'OP': '[+*]|[?](?![a-z])',
'OP': '[+*]|[?](?![a-z_])',
'_COLON': ':',
'_COMMA': ',',
'_OR': r'\|',
Expand Down
7 changes: 7 additions & 0 deletions lark/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
import warnings

from lark import Lark, logger
try:
from interegular import logger as interegular_logger
has_interegular = True
except ImportError:
has_interegular = False

lalr_argparser = ArgumentParser(add_help=False, epilog='Look at the Lark documentation for more info on the options')

Expand Down Expand Up @@ -40,6 +45,8 @@

def build_lalr(namespace):
logger.setLevel((ERROR, WARN, INFO, DEBUG)[min(namespace.verbose, 3)])
if has_interegular:
interegular_logger.setLevel(logger.getEffectiveLevel())
if len(namespace.start) == 0:
namespace.start.append('start')
kwargs = {n: getattr(namespace, n) for n in options}
Expand Down
4 changes: 2 additions & 2 deletions lark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import reduce
from itertools import product
from collections import deque
from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence
from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable

###{standalone
import sys, re
Expand All @@ -21,7 +21,7 @@
T = TypeVar("T")


def classify(seq: Sequence, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict:
def classify(seq: Iterable, key: Optional[Callable] = None, value: Optional[Callable] = None) -> Dict:
d: Dict[Any, Any] = {}
for item in seq:
k = key(item) if (key is not None) else item
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"regex": ["regex"],
"nearley": ["js2py"],
"atomic_cache": ["atomicwrites"],
"interegular": ["interegular>=0.2.4"],
},

package_data = {'': ['*.md', '*.lark'], 'lark': ['py.typed']},
Expand Down
1 change: 1 addition & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
interegular>=0.2.4
Js2Py==0.68
regex
Loading

0 comments on commit 1daba2c

Please sign in to comment.