-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathEarleyParser.py
115 lines (92 loc) · 3.28 KB
/
EarleyParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python2
# coding: utf-8
import string
import sys
import re
import Tokenizer
INITIAL_NONTERMINAL = 'Start'
class State:
def __init__(self, begin, nonterminal, parsed, remains, children):
self.begin = begin
self.nonterminal = nonterminal
self.parsed = parsed
self.remains = remains
self.children = children
def __str__(self):
return(
"\033[1;40;97m[ " +
"\033[1;44;97m " + str(self.begin) + " \033[107;30m:"
"\033[1;102;30m" + str(self.parsed) +
"\033[1;103;30m" + str(self.remains) +
"\033[1;40;97m ] > " + str(self.children) + "\033[0m"
)
def __eq__(self, other):
return(
self.begin == other.begin and
self.nonterminal == other.nonterminal and
self.parsed == other.parsed and
self.remains == other.remains
)
def step(self, child):
"""
Pushes one parsed step over state.
"""
return State(
self.begin,
self.nonterminal,
self.parsed + [self.remains[0]],
self.remains[1:],
self.children + [child]
)
_TYPE_SUBST = int
class EarleyParser:
def __init__(self, rules):
self.rules = rules
def match_token(self, template, token):
if type(token) == _TYPE_SUBST and type(template) == _TYPE_SUBST:
return template == token
if type(token) == str and type(template) == str and template not in self.rules:
return True
return False
def push(self, cursor, state):
for s in self.states[cursor]:
if s == state:
return
self.states[cursor].append(state)
def complete(self, completed, cursor):
for s in self.states[completed.begin]:
if len(s.remains) > 0 and s.remains[0] == completed.nonterminal:
self.push(cursor, s.step(completed))
def predict(self, state, i):
for r in self.rules[state.remains[0]]:
self.push(i, State(i, state.remains[0], [], r, []))
def scan(self, state, token, cursor):
if self.match_token(state.remains[0], token):
self.push(cursor + 1, state.step(token))
def parse(self, text):
self.states = [
[
State(0, INITIAL_NONTERMINAL, [], r, [])
for r in self.rules[INITIAL_NONTERMINAL]
]
if i == 0 else [] for i in range(len(text) + 1)
]
for i in range(len(text) + 1):
print "\033[1;92m" + str(text[:i]) + "\033[35m █ \033[93m" + str(text[i:]) + "\033[0m"
for s in self.states[i]:
print s
if s.remains == []:
self.complete(s, i)
elif s.remains[0] in self.rules:
self.predict(s, i)
elif i < len(text):
self.scan(s, text[i], i)
for s in self.states[-1]:
if len(s.remains) == 0 and s.begin == 0 and s.nonterminal == INITIAL_NONTERMINAL:
return s
return None
def PrintTree(s, indent=0):
if type(s) not in [type(None), str, _TYPE_SUBST]:
print '\t' * indent + str(s)
for c in s.children:
PrintTree(c, indent + 1)