-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscanner.py
189 lines (165 loc) · 6.91 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python3
import re
import sys
from pyparsing import OneOrMore, ZeroOrMore, Regex, Keyword, Forward # Used just for pre checking of grammar | Optional
__author__ = 'rohitmehra'

# Reserved words that the scanner reports as operator / literal tokens
# instead of as plain keyword tokens.
MULTIPLICATIVE = {'mod', 'div'}
boollit = {'false', 'true'}

# Every reserved word of the language.  An identifier lexeme found in this
# set is never reported as an ``ident`` token.
keywords = MULTIPLICATIVE | boollit | {
    'program', 'var', 'as', 'int', 'bool',
    'begin', 'end',
    'if', 'then', 'else',
    'while', 'do',
    'writeInt', 'readInt',
}

# (group name, regular expression) pairs.  The scanner joins them with '|'
# into a single pattern, so ORDER MATTERS: the first alternative that matches
# at a position wins, and the catch-all MISMATCH entry has to stay last.
token_specification = [
    # Non-negative integer literal without leading zeros.
    ('num', r'[1-9][0-9]*|0'),
    # Assignment operator.
    ('ASGN', r':='),
    # Opening parenthesis.
    ('LP', r'\('),
    # Closing parenthesis.
    ('RP', r'\)'),
    # Statement terminator.
    ('SC', r';'),
    # Identifier or keyword: a letter or underscore, then letters / digits.
    ('ident', r'[a-z_A-Z][a-zA-Z0-9]*'),
    # Symbolic multiplicative operator ('div' and 'mod' arrive as ident).
    ('MULTIPLICATIVE', r'[*]'),
    # Comparison operators; two-character forms are listed before their
    # one-character prefixes so that '<=' is not split into '<' and '='.
    ('COMPARE', r'[<][=]|[>][=]|[!][=]|[<]|[>]|[=]'),
    # Additive operators.
    ('ADDITIVE', r'[+\-]'),
    # Line ending, matched separately so lines can be counted.
    ('NEWLINE', r'\n'),
    # Blanks, tabs and '%' comments running to the end of the line.
    ('SKIP', r'[ \t]+|%.*'),
    # Any other single character (reported as a scanner error).
    ('MISMATCH', r'.'),
]
class Token:
    """A single lexical token: a type name plus an optional lexeme value.

    Attributes:
        type: The token's type name (stored from the ``typ`` argument).
        value: The associated value, or ``None`` when the token has none.
    """

    def __init__(self, typ, value):
        """Store the token type and its (possibly ``None``) value."""
        self.type = typ
        self.value = value

    def __str__(self):
        """Return ``type(value)``, or just ``type`` when there is no value."""
        # Compare against None explicitly: a truthiness test would silently
        # drop a present-but-falsy value such as the integer 0.
        if self.value is not None:
            return '{type}({value})'.format(type=self.type, value=self.value)
        return self.type

    def __repr__(self):
        """Return the same text as ``str(self)``."""
        return self.__str__()
class Scanner:
    """Lexical scanner for ``.tl`` programs.

    Splits program text into ``Token`` objects, pre-checks the resulting
    token stream against a pyparsing grammar, and writes the outcome (token
    list, scanner error, or ``SYNTAX ERROR``) to a ``.tok`` file.

    Attributes:
        tokens: String form of every token collected by ``parse``.
        token_objects: The ``Token`` instances collected by ``parse``.
        opfilename: Path of the output file (``test.tok`` unless the program
            was read from a file named on the command line).
        program: The program text to scan.
    """

    def __init__(self, progam='%'):
        """Create a scanner and load its program text.

        Args:
            progam: Program text to scan, or ``'%'`` (the default) to read
                the file named by ``sys.argv[1]``.  The parameter spelling is
                kept as-is so keyword callers keep working.
        """
        self.tokens = []
        self.token_objects = []
        self.opfilename = 'test.tok'
        self.program = progam
        self.input(progam)

    @staticmethod
    def tokenize(code_file_string):
        """Yield a ``Token`` for every lexeme found in ``code_file_string``.

        Whitespace, ``%`` comments and newlines produce no token.  Keywords
        are yielded with their upper-cased name as type and no value, except
        ``div``/``mod`` (type ``MULTIPLICATIVE``) and ``true``/``false``
        (type ``boollit``), which keep the lexeme as value.  An unexpected
        character, or an integer literal above 2147483647, is yielded as a
        value-less token whose *type* is a ``SCANNER ERROR: ...`` message;
        scanning then continues with the next lexeme.

        Args:
            code_file_string: The program text.

        Yields:
            Token: One token per lexeme, in source order.
        """
        # One alternation of named groups; lastgroup then identifies which
        # entry of token_specification produced each match.
        all_patterns = re.compile(
            '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
        )
        line_num = 1
        for match_object in all_patterns.finditer(code_file_string):
            kind = match_object.lastgroup
            value = match_object.group(kind)
            if kind == 'NEWLINE':
                line_num += 1
            elif kind == 'SKIP':
                continue
            elif kind == 'MISMATCH' or (kind == 'num' and int(value) > 2147483647):
                # The error is reported as a token rather than raised so the
                # caller decides how to surface it.
                yield Token('SCANNER ERROR: %r unexpected on line %d' % (value, line_num), None)
            elif kind == 'ident' and value in keywords:
                if value in MULTIPLICATIVE:
                    yield Token('MULTIPLICATIVE', value)
                elif value in boollit:
                    yield Token('boollit', value)
                else:
                    yield Token(value.upper(), None)
            elif kind in ('LP', 'RP', 'ASGN', 'SC'):
                # Pure punctuation: the type alone says everything.
                yield Token(kind, None)
            else:
                yield Token(kind, value)

    @staticmethod
    def grammar_check(token_string):
        """Pre-check a token stream against the program grammar.

        Args:
            token_string: The tokens' string forms separated by whitespace
                (``parse`` joins them with newlines).

        Returns:
            bool: ``True`` when the whole stream is one valid program,
            ``False`` otherwise.
        """
        from pyparsing import ParseBaseException

        start, end = Keyword('PROGRAM'), Keyword('END')
        intg, boole = Keyword('INT'), Keyword('BOOL')
        lp, rp = Keyword('LP'), Keyword('RP')
        sc = Keyword('SC')
        ident = Regex(r'ident\(.+?\)')
        num = Regex(r'num\(.+?\)')
        compare = Regex(r'COMPARE\(.+?\)')
        additive = Regex(r'ADDITIVE\(.+?\)')
        multi = Regex(r'MULTIPLICATIVE\(.+?\)')
        bool_literal = Regex(r'boollit\(.+?\)')
        typ = intg ^ boole

        # Forward declarations for the two recursive non-terminals.
        expression = Forward()
        statementSequence = Forward()

        factor = ident ^ num ^ bool_literal ^ (lp + expression + rp)
        term = (OneOrMore(factor + multi) + factor) ^ factor
        simpleExpression = (OneOrMore(term + additive) + term) ^ term
        expression <<= (OneOrMore(simpleExpression + compare) + simpleExpression) ^ simpleExpression

        # NOTE(review): ZeroOrMore accepts several consecutive ELSE clauses;
        # if the language allows at most one this should become optional.
        elseClause = ZeroOrMore('ELSE' + statementSequence)
        ifStatement = 'IF' + expression + 'THEN' + statementSequence + elseClause + end
        whileStatement = 'WHILE' + expression + 'DO' + statementSequence + end
        assignment = ((ident + 'ASGN' + expression) ^ (ident + 'ASGN' + 'READINT'))
        writeInt = 'WRITEINT' + expression
        statement = (assignment ^ ifStatement ^ whileStatement ^ writeInt)
        statementSequence <<= ZeroOrMore(statement + sc)
        declarations = ZeroOrMore('VAR' + ident + 'AS' + typ + sc)
        program = start + declarations + 'BEGIN' + statementSequence + end

        try:
            # parseAll=True: without it, tokens left over after the final END
            # would be ignored and the stream reported as valid.
            program.parseString(token_string, parseAll=True)
        except (ParseBaseException, RecursionError):
            return False
        return True

    def input(self, prog='%'):
        """Load the program text to scan.

        When ``prog`` is ``'%'`` the text is read from the ``.tl`` file named
        by ``sys.argv[1]`` (falling back to latin-1 when the default decoding
        fails) and ``opfilename`` becomes that path with ``tl`` replaced by
        ``tok``.  Any other value is taken as the program text itself.

        Args:
            prog: ``'%'`` to read from the command-line file, otherwise the
                program text.

        Side effects:
            Prints ``FileNotFound: <reason>`` and exits with status 2 when
            the argument is missing, the file is not a ``.tl`` file, or it
            cannot be opened.
        """
        if prog != '%':
            # Text supplied directly: nothing to read from disk.
            self.file = prog
            self.program = prog
            return

        try:
            path = sys.argv[1]
            # Validate the extension before touching the file system.
            if not path.endswith('.tl'):
                raise RuntimeError('Only .tl Files can be Parsed!!')
            try:
                with open(path, 'r') as source:
                    text = source.read()
            except UnicodeDecodeError:
                # latin-1 maps every byte, so this retry cannot fail to decode.
                with open(path, 'r', encoding='latin1') as source:
                    text = source.read()
        except (IndexError, OSError, RuntimeError) as e:
            print('FileNotFound: %s' % e)
            sys.exit(2)

        self.file = text
        self.opfilename = path[:-2] + 'tok'
        self.program = self.file

    def _write_output(self, text):
        """Overwrite the output file ``self.opfilename`` with ``text``."""
        with open(self.opfilename, 'w') as out:
            out.write(text)

    def parse(self):
        """Tokenize ``self.program`` and write the result to the output file.

        On a scanner error the first error message is written and printed and
        the process exits.  Otherwise the token stream is grammar-checked:
        a valid stream is written one token per line, an invalid one yields
        ``SYNTAX ERROR`` (written and printed).

        Side effects:
            Appends to ``self.tokens`` and ``self.token_objects``, writes
            ``self.opfilename``, may print and may call ``sys.exit``.
        """
        scanner_error = None
        for token in self.tokenize(self.program):
            text = repr(token)
            if text.startswith('SCANNER ERROR'):
                # Report the first error; later ones are usually follow-ups.
                scanner_error = text
                break
            self.tokens.append(text)
            self.token_objects.append(token)

        if scanner_error is not None:
            self._write_output(scanner_error)
            print(scanner_error)
            # NOTE(review): exit status 0 on an error is kept for backward
            # compatibility with existing callers; confirm whether a
            # non-zero status is wanted.
            sys.exit(0)

        token_stream = '\n'.join(self.tokens)
        if self.grammar_check(token_stream):
            self._write_output(token_stream)
        else:
            self._write_output('SYNTAX ERROR')
            print('SYNTAX ERROR')
if __name__ == '__main__':
    # Sample program kept for reference.  It is NOT used by the command-line
    # path below, which reads the program from the file named in sys.argv[1].
    prog = '''program
var N as int ;
var SQRT as int ;
begin
N := readInt ;
SQRT := 0 ;
while SQRT * SQRT <= N do
SQRT := SQRT + 1 ;
end ;
SQRT := SQRT - 1 ;
writeInt SQRT ; %(rm)
end '''
    s = Scanner()  # no argument: load the program from the command-line file
    s.parse()  # tokenize, grammar-check and write the output file