-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcaso_lexer.py
187 lines (156 loc) · 8.61 KB
/
caso_lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import re
from caso_exception import CASOIllegalTokenError
class Token:
    """A single lexical unit: its kind, its value, and where it was found.

    Attributes:
        type: Name of the token kind (stored from the ``type_`` argument).
        value: The token's payload as supplied by the caller.
        line_num: Line number supplied by the caller.
        char_pos: Character position supplied by the caller.
    """

    def __init__(self, type_, value, line_num, char_pos):
        # ``type_`` carries a trailing underscore only to avoid shadowing the
        # builtin; the public attribute is plain ``type``.
        self.type, self.value = type_, value
        self.line_num, self.char_pos = line_num, char_pos

    def __repr__(self):
        """Return ``Token(<type>, <value>, <line_num>, <char_pos>)`` using each field's repr."""
        rendered = ", ".join(
            map(repr, (self.type, self.value, self.line_num, self.char_pos))
        )
        return f"Token({rendered})"
class CASOLexer:
    """Regex-driven lexer that converts CASO source text into ``Token`` objects.

    Native Java sections delimited by ``caso___native_java_start`` /
    ``caso___native_java_end`` are lifted out of the source before scanning and
    come back as single ``GENERAL_JAVA_TOKEN`` tokens whose value is the
    stripped Java text.

    Attributes:
        source_code: The text being lexed (rewritten with placeholders once
            Java blocks have been extracted).
        tokens: Tokens produced so far by ``tokenize``.
        current_position: Offset into ``source_code`` of the next character to scan.
        tok_regex: Single compiled alternation built from ``TOKEN_SPECIFICATION``.
        java_blocks: Extracted Java snippets, indexed by placeholder number.
    """

    # Order matters: the alternatives are tried top to bottom, so longer
    # operators must precede their prefixes (e.g. "==" before "=").
    TOKEN_SPECIFICATION = [
        ("NUMBER", r"\d+(\.\d*)?"),  # Integer or decimal numbers
        # Types
        ("INT", r"\bInt\b"),  # Int type
        ("FLOAT", r"\bFloat\b"),  # Float type
        ("BOOL", r"\bBool\b"),  # Bool type
        ('STRING', r'\bStr\b'),  # String type
        ('ANY', r'\bAny\b'),  # Any type
        ('EMPTY', r'\bEmpty\b'),  # Empty type (used for void functions)
        # Keywords
        ('LET', r'\blet\b'),  # Variable declaration
        ('CONST', r'\bconst\b'),  # Constant declaration
        ('WHEN', r'\bwhen\b'),  # Pattern matching
        ('FUNCTION', r'\bfnc\b'),  # Function declaration
        ('IF', r'\bif\b'),  # If statements
        ('ELSE', r'\belse\b'),  # Else statements
        ('ELSIF', r'\belsif\b'),  # Else-if statements
        ('LOOP', r'\bloop\b'),  # Loops
        ('TO', r'\bto\b'),  # Loop ranges (example: loop (i, 10 to 20))
        ('OBJECT', r'\bobj\b'),  # Object declaration (classes in Java)
        ('INIT', r'\binit\b'),  # Object initialization (constructor in Java)
        ('TAKE', r'\btake\b'),  # Take keyword
        ('FROM', r'\bfrom\b'),  # From keyword
        ('INCORPORATE', r'\bincorp\b'),  # Incorporate keyword
        ('SHARED', r'\bshared\b'),  # Shared keyword (static in Java)
        # Boolean values
        ('TRUE', r'\btrue\b'),  # True keyword
        ('FALSE', r'\bfalse\b'),  # False keyword
        # LERSILER (Lexer, Parser, Transpiler) delimiters. These are meant for
        # the lexer, parser and transpiler themselves rather than for users.
        ('NATIVE_JAVA_START', r'\bcaso___native_java_start\b'),  # Native Java start
        ('NATIVE_JAVA_END', r'\bcaso___native_java_end\b'),  # Native Java end
        ('PARSER_PAUSE', r'\bcaso___parser_pause\b'),  # Parser pause
        ('PARSER_RESUME', r'\bcaso___parser_resume\b'),  # Parser resume
        ('TRANSPILE_PAUSE', r'\bcaso___transpile_pause\b'),  # Transpiler pause
        ('TRANSPILE_RESUME', r'\bcaso___transpile_resume\b'),  # Transpiler resume
        ('GENERAL_JAVA_TOKEN', r'\bcaso___general_java_token\b'),  # General Java token
        # Comparison operators
        ('EQ', r'=='),  # Equality operator
        ('NEQ', r'!='),  # Inequality operator
        ('LE', r'<='),  # Less than or equal to operator
        ('LT', r'<'),  # Less than operator
        ('PREDICATE', r'\$\>'),  # Predicate return start operator
        ('GE', r'>='),  # Greater than or equal to operator
        ('GT', r'>'),  # Greater than operator
        ('UKN', r'\?'),  # Unknown operator (used for pattern matching)
        ('AND', r'&&'),  # And operator
        ('OR', r'\|\|'),  # Or operator
        # Operators
        ('ASSIGN', r'='),  # Assignment operator
        ('REASSIGN', r':='),  # Reassignment operator
        ('IMPORT', r'::'),  # Import operator (imports specific modules rather than a whole library)
        ('TYPE_ASSIGN', r':'),  # Type assignment operator
        ('ARROW', r'->'),  # Arrow operator
        ('INHERIT', r'@'),  # Inheritance operator
        ('PIPE', r'\|'),  # Pipe operator (for now used for returns)
        ('DOT', r'\.'),  # Dot operator (used for object properties)
        ('AT', r'\*\*'),  # At operator
        # Characters
        ('OPEN_PAREN', r'\('),  # Open parenthesis
        ('CLOSE_PAREN', r'\)'),  # Close parenthesis
        ('OPEN_BRACKET', r'\['),  # Open bracket
        ('CLOSE_BRACKET', r'\]'),  # Close bracket
        ('OPEN_BRACE', r'\{'),  # Open brace
        ('CLOSE_BRACE', r'\}'),  # Close brace
        ("COMMENT", r"//.*"),  # Comment
        ("COMMA", r","),  # Comma
        ("SEMICOLON", r";"),  # Semicolon
        ('EXCLAMATION', r'!'),  # Exclamation mark
        # Arithmetic operators (placed after the other operators to avoid conflicts)
        ('PLUS', r'\+'),  # Addition operator
        ('MINUS', r'-'),  # Subtraction operator
        ('MUL', r'\*'),  # Multiplication operator
        ('DIV', r'/'),  # Division operator
        ('MOD', r'%'),  # Modulo operator
        # Other
        ("ID", r"[A-Za-z0-9_]+"),  # Identifiers (alphanumeric characters and underscore)
        ("STRING_LITERAL", r'"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\''),  # String literals
        ("NEWLINE", r"\n"),  # Line endings
        # "\r" is skipped so CRLF line endings lex as a plain NEWLINE.
        ("SKIP", r"[ \t\r]+"),  # Skip over spaces, tabs and carriage returns
    ]

    _JAVA_START_TAG = 'caso___native_java_start'
    _JAVA_END_TAG = 'caso___native_java_end'
    _PLACEHOLDER_FORMAT = "<JAVA_BLOCK_{}>"
    # Matches exactly the text produced by _PLACEHOLDER_FORMAT.
    _PLACEHOLDER_RE = re.compile(r"<JAVA_BLOCK_([0-9]+)>")

    def __init__(self, source_code):
        """Prepare a lexer for ``source_code``; nothing is scanned until ``tokenize``."""
        self.source_code = source_code
        self.tokens = []
        self.current_position = 0
        self.tok_regex = re.compile(
            "|".join(
                f"(?P<{name}>{pattern})"
                for name, pattern in self.TOKEN_SPECIFICATION
            )
        )
        self.java_blocks = []  # Stores the extracted Java blocks
        # block index -> (newlines in the removed text, chars after its last newline);
        # lets tokenize() keep line/column numbers aligned with the original text.
        self._java_block_extents = {}

    def preprocess_java_blocks(self):
        """Extract native Java blocks and replace each with a ``<JAVA_BLOCK_n>`` placeholder.

        The stripped text between the start and end tags is appended to
        ``self.java_blocks``; ``self.source_code`` is rewritten so that the
        whole tagged region becomes a placeholder carrying the block's index.
        State is only updated once every block has been matched, so a failure
        leaves the lexer untouched. Calling this again on already-processed
        source is a no-op.

        Raises:
            CASOIllegalTokenError: if a start tag has no matching end tag. The
                error carries the 1-based line and 0-based column of the tag.
        """
        source = self.source_code
        start_tag, end_tag = self._JAVA_START_TAG, self._JAVA_END_TAG

        pieces = []       # alternating untouched text and placeholders
        new_blocks = []   # Java snippets found during this call
        new_extents = {}  # layout info for those snippets, keyed by block index
        cursor = 0

        while True:
            start = source.find(start_tag, cursor)
            if start == -1:
                break  # No more Java blocks
            end = source.find(end_tag, start)
            if end == -1:
                line = source.count("\n", 0, start) + 1
                column = start - (source.rfind("\n", 0, start) + 1)
                raise CASOIllegalTokenError(
                    "Unmatched 'caso___native_java_start'", line, column
                )

            region_end = end + len(end_tag)
            removed = source[start:region_end]
            block_index = len(self.java_blocks) + len(new_blocks)

            new_blocks.append(source[start + len(start_tag):end].strip())
            # rfind() returns -1 when there is no newline, which makes the
            # "tail" the whole removed region -- exactly what a single-line
            # block needs for column bookkeeping.
            new_extents[block_index] = (
                removed.count("\n"),
                len(removed) - (removed.rfind("\n") + 1),
            )

            pieces.append(source[cursor:start])
            pieces.append(self._PLACEHOLDER_FORMAT.format(block_index))
            cursor = region_end

        if new_blocks:
            pieces.append(source[cursor:])
            self.source_code = "".join(pieces)
            self.java_blocks.extend(new_blocks)
            self._java_block_extents.update(new_extents)

    def tokenize(self):
        """Scan the source and return the accumulated list of tokens.

        Java blocks are extracted first. Whitespace and ``//`` comments produce
        no tokens; every newline produces a ``NEWLINE`` token stamped with the
        line it terminates. ``NUMBER`` values are converted to ``int`` or, when
        they contain a dot, ``float``. Line numbers are 1-based and character
        positions are 0-based offsets from the start of the line.

        Returns:
            ``self.tokens`` (the same list object on every call).

        Raises:
            CASOIllegalTokenError: on a character no token pattern accepts, or
                on an unmatched native-Java start tag.
        """
        self.preprocess_java_blocks()  # Handle Java blocks first
        source = self.source_code
        line_num = 1
        line_start = 0

        while self.current_position < len(source):
            position = self.current_position

            # Java block placeholder? Only honoured when it refers to a block
            # that really exists; look-alike text is lexed like anything else.
            placeholder = self._PLACEHOLDER_RE.match(source, position)
            if placeholder is not None:
                block_index = int(placeholder.group(1))
                if block_index < len(self.java_blocks):
                    self.tokens.append(
                        Token(
                            "GENERAL_JAVA_TOKEN",
                            self.java_blocks[block_index],
                            line_num,
                            position - line_start,
                        )
                    )
                    extent = self._java_block_extents.get(block_index)
                    if extent is not None:
                        newline_count, tail_len = extent
                        if newline_count:
                            # The removed text spanned several lines: resume
                            # counting on the line holding the end tag.
                            line_num += newline_count
                            line_start = placeholder.end() - tail_len
                        else:
                            # Same line: compensate for the length difference
                            # between the removed text and its placeholder.
                            line_start -= tail_len - (placeholder.end() - position)
                    self.current_position = placeholder.end()
                    continue

            match = self.tok_regex.match(source, position)
            if match is None:
                # No pattern accepts the character at this position.
                raise CASOIllegalTokenError(
                    f"Illegal character '{source[position]}' at line {line_num}",
                    line_num,
                    position - line_start,
                )

            kind = match.lastgroup
            column = match.start() - line_start
            if kind == "NEWLINE":
                # Record the token before advancing so it reports the line it ends.
                self.tokens.append(Token(kind, '\n', line_num, column))
                line_num += 1
                line_start = match.end()
            elif kind not in ("SKIP", "COMMENT"):
                value = match.group(kind)
                if kind == "NUMBER":
                    value = float(value) if '.' in value else int(value)
                self.tokens.append(Token(kind, value, line_num, column))
            self.current_position = match.end()

        return self.tokens