Earley: share nodes created by the scanner with the completer #1451

Merged · 1 commit · Aug 30, 2024
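Reviewer context: the core of this change is that the Earley scanner and the completer now share a single SPPF node cache, instead of `predict_and_complete` rebuilding its own cache on every call. A minimal sketch of the idea follows; `SymbolNode` and `make_node` are illustrative stand-ins, not lark's actual internals:

```python
# Sketch of the node-sharing idea behind this PR. SymbolNode and
# make_node are illustrative only, not lark's real classes.

class SymbolNode:
    """Toy SPPF symbol node, identified by its (symbol, start, end) label."""
    def __init__(self, s, start, end):
        self.s, self.start, self.end = s, start, end
        self.children = []

def make_node(node_cache, s, start, end):
    # Both the scanner and the completer build nodes through the same
    # dict, so a node with a given label is created exactly once and
    # shared, rather than rebuilt by each phase independently.
    label = (s, start, end)
    if label not in node_cache:
        node_cache[label] = SymbolNode(s, start, end)
    return node_cache[label]

node_cache = {}
a = make_node(node_cache, 'expr', 0, 3)  # e.g. created by the scanner
b = make_node(node_cache, 'expr', 0, 3)  # later requested by the completer
assert a is b  # one shared node, no duplicate derivations
```

Before this change, nodes the scanner had just created for the next column were rebuilt from scratch by the completer, which could surface as duplicated derivations under `_ambig` nodes (see the updated tests below).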
28 changes: 11 additions & 17 deletions lark/parsers/earley.py
@@ -75,7 +75,7 @@ def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matc
         self.term_matcher = term_matcher


-    def predict_and_complete(self, i, to_scan, columns, transitives):
+    def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
         """The core Earley Predictor and Completer.

         At each stage of the input, we handle any completed items (things
@@ -84,7 +84,6 @@ def predict_and_complete(self, i, to_scan, columns, transitives):
         non-terminals are recursively processed until we reach a set of terminals,
         which can be added to the scan list for the next scanner cycle."""
         # Held Completions (H in E. Scott's paper).
-        node_cache = {}
         held_completions = {}

         column = columns[i]
@@ -203,7 +202,7 @@ def scan(i, token, to_scan):
             for item in self.Set(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
-                    label = (new_item.s, new_item.start, i)
+                    label = (new_item.s, new_item.start, i + 1)
                     # 'terminals' may not contain token.type when using %declare
                     # Additionally, token is not always a Token
                     # For example, it can be a Tree when using TreeMatcher
@@ -227,7 +226,7 @@ def scan(i, token, to_scan):
                 expect = {i.expect.name for i in to_scan}
                 raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

-            return next_to_scan
+            return next_to_scan, node_cache


         # Define parser functions
@@ -245,16 +244,17 @@ def scan(i, token, to_scan):
         # step.
         expects = {i.expect for i in to_scan}
         i = 0
+        node_cache = {}
         for token in lexer.lex(expects):
-            self.predict_and_complete(i, to_scan, columns, transitives)
+            self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

-            to_scan = scan(i, token, to_scan)
+            to_scan, node_cache = scan(i, token, to_scan)
             i += 1

             expects.clear()
             expects |= {i.expect for i in to_scan}

-        self.predict_and_complete(i, to_scan, columns, transitives)
+        self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

         ## Column is now the final column in the parse.
         assert i == len(columns)-1
@@ -294,24 +294,18 @@ def parse(self, lexer, start):
         except ImportError:
             logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
         else:
-            for i, s in enumerate(solutions):
-                debug_walker.visit(s, f"sppf{i}.png")
+            debug_walker.visit(solutions[0], "sppf.png")

+        if len(solutions) > 1:
+            assert False, 'Earley should not generate multiple start symbol items!'

         if self.Tree is not None:
             # Perform our SPPF -> AST conversion
             # Disable the ForestToParseTree cache when ambiguity='resolve'
             # to prevent a tree construction bug. See issue #1283
             use_cache = not self.resolve_ambiguity
             transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
-            solutions = [transformer.transform(s) for s in solutions]
-
-            if len(solutions) > 1 and not self.resolve_ambiguity:
-                t: Tree = self.Tree('_ambig', solutions)
-                t.expand_kids_by_data('_ambig') # solutions may themselves be _ambig nodes
-                return t
-            return solutions[0]
+            return transformer.transform(solutions[0])

         # return the root of the SPPF
         # TODO return a list of solutions, or join them together somehow
         return solutions[0]
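Note on the label change in `scan()`: advancing an item over the token at position i yields a node that ends at position i + 1, and the completer processes that node while working on column i + 1. If the label still ended at i, the completer's lookups in the shared node_cache would miss the scanner's nodes and duplicates would reappear. A small illustration with made-up values:

```python
# Illustrative values only, not lark code: an item that started at
# position 0 is advanced over the token at position i == 2. The
# resulting node spans input[0:3], so its cache label must end at
# i + 1 for the completer's lookup at the next column to hit.
item_start, i = 0, 2
label = ('expr', item_start, i + 1)  # ('expr', 0, 3), not ('expr', 0, 2)
assert label == ('expr', 0, 3)
```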
9 changes: 5 additions & 4 deletions lark/parsers/xearley.py
@@ -127,7 +127,7 @@ def scan(i, to_scan):
                     considered_rules=considered_rules
                 )

-        return next_to_scan
+        return next_to_scan, node_cache


         delayed_matches = defaultdict(list)
@@ -146,10 +146,11 @@ def scan(i, to_scan):
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
         i = 0
+        node_cache = {}
         for token in stream:
-            self.predict_and_complete(i, to_scan, columns, transitives)
+            self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

-            to_scan = scan(i, to_scan)
+            to_scan, node_cache = scan(i, to_scan)

             if token == '\n':
                 text_line += 1
@@ -158,7 +159,7 @@
                 text_column += 1
             i += 1

-        self.predict_and_complete(i, to_scan, columns, transitives)
+        self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

         ## Column is now the final column in the parse.
         assert i == len(columns)-1
13 changes: 5 additions & 8 deletions tests/test_parser.py
@@ -836,14 +836,14 @@ def test_multiple_start_solutions(self):
         tree = l.parse('x')

         expected = Tree('_ambig', [
-            Tree('start', [Tree('a', ['x'])]),
             Tree('start', ['x']),
-        ])
+            Tree('start', [Tree('a', ['x'])])]
+        )
         self.assertEqual(tree, expected)

         l = Lark(grammar, ambiguity='resolve', lexer=LEXER)
         tree = l.parse('x')
-        assert tree == Tree('start', ['x'])
+        assert tree == Tree('start', [Tree('a', ['x'])])


def test_cycle(self):
@@ -872,10 +872,7 @@ def test_cycle2(self):
         tree = l.parse("ab")
         expected = (
             Tree('start', [
-                Tree('_ambig', [
-                    Tree('v', [Tree('v', [])]),
-                    Tree('v', [Tree('v', [Tree('v', [])])])
-                ])
+                Tree('v', [Tree('v', [])]),
             ])
         )
         self.assertEqual(tree, expected)
@@ -990,7 +987,7 @@ def test_consistent_derivation_order1(self):
         ''', lexer=LEXER)

         tree = parser.parse('..')
-        n = Tree('a', [Tree('b', [])])
+        n = Tree('a', [])
         assert tree == Tree('start', [n, n])

     _NAME = "TestFullEarley" + LEXER.capitalize()
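To see the user-visible effect of the deduplication, here is a hedged, standalone example; the grammar is our own reconstruction consistent with the expected trees above, since the test's actual grammar lies outside the diff's context lines:

```python
from lark import Lark

# Reconstructed ambiguous grammar: 'x' parses either directly via the
# terminal A, or through rule a, so ambiguity='explicit' should yield
# an _ambig node with each derivation appearing exactly once.
grammar = r"""
!start: a | A
!a: A
A: "x"
"""

parser = Lark(grammar, ambiguity='explicit')
print(parser.parse('x').pretty())   # _ambig with both derivations, no duplicates

parser = Lark(grammar, ambiguity='resolve')
print(parser.parse('x').pretty())   # a single resolved derivation
```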