-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlex.cpp
86 lines (70 loc) · 2.43 KB
/
lex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/* ollieberzs 2018
** lex.cpp
** lexing oca source into tokens
*/
#include <iostream>
#include <regex>
#include "oca.hpp"
OCA_BEGIN
// Writes this token to standard output as "<typename>value" followed by a
// newline.
//
// The type name is found by using `type` as an index into the table below,
// so the table order must match the order of the token type enumeration
// (declared outside this file). For INDENT tokens the value itself is not
// printed; `val.size() - 1` is printed instead.
void Token::print() const {
    // Built once instead of allocating a vector of strings on every call.
    static const char* const typestrings[] = {
        "string",      "fstring", "binnum",   "hexnum",     "scientnum", "real",
        "integer",     "boolean", "filepath", "keyword",    "name",      "operator",
        "punctuation", "comment", "indent",   "whitespace", "invalid",   "last"};

    std::cout << "<" << typestrings[type] << ">";
    if (type != Type::INDENT) {
        std::cout << val;
    } else {
        // NOTE(review): presumably an indent token's value is one leading
        // character (e.g. the newline) plus the indentation itself, hence the
        // "- 1" -- confirm against the syntax table. An empty value would wrap
        // around because size() is unsigned.
        std::cout << val.size() - 1;
    }
    std::cout << "\n";
}
//-----------------------------
// Records, for each syntax element, how many capture groups it occupies once
// it has been wrapped in its own "( ... )" group -- the same wrapping
// tokenize() applies when it joins all elements into one regex. The count is
// therefore 1 (the wrapper) plus the number of groups inside the element's
// own pattern. indexFromGroup() uses these counts to map a capture group
// back to its syntax element.
Lexer::Lexer() {
    captureGroupCounts.reserve(syntax.size());
    for (const auto& element : syntax) {
        // mark_count() reports the number of capture groups directly, so no
        // dummy match against an empty string is needed.
        const std::regex wrapped("(" + element.second + ")");
        captureGroupCounts.push_back(wrapped.mark_count());
    }
}
// Splits `source` into tokens.
//
// All syntax elements are joined into a single alternation regex, each one
// wrapped in its own capture group. For every match, the first non-empty
// capture group identifies which syntax element matched.
//
// Whitespace and comment matches produce no token. Every other match becomes
// a token holding the element's type, the full matched text and the match
// position in `source`. A final Token::LAST token positioned at
// source.size() is always appended.
//
// Throws Error(INDENTED_FILE) if `source` starts with a space, and
// Error(UNKNOWN_SYMBOL, <position>) when the INVALID element matches.
std::vector<Token> Lexer::tokenize(const std::string& source) {
    // `tokens` is declared outside this function (not visible here). Start
    // from a clean state so tokens left over from an earlier call that threw
    // half-way through cannot leak into this result.
    tokens.clear();

    // source[0] is well-defined for an empty string (it yields '\0').
    if (source[0] == ' ')
        throw Error(INDENTED_FILE);

    std::string fullRegexString;
    for (const auto& element : syntax)
        fullRegexString += "(" + element.second + ")|";
    // Drop the trailing '|'; the guard covers an empty syntax table, where
    // pop_back() on an empty string would be undefined behaviour.
    if (!fullRegexString.empty())
        fullRegexString.pop_back();
    const std::regex regex(fullRegexString);

    const std::sregex_iterator end;
    for (std::sregex_iterator it(source.begin(), source.end(), regex); it != end; ++it) {
        const uint pos = static_cast<uint>(it->position());

        // Group 0 is the whole match; the real capture groups are
        // 1 .. size() - 1.
        for (uint group = 1; group < it->size(); ++group) {
            if (it->length(group) == 0)
                continue;

            // The first non-empty group is the wrapper group of the element
            // that matched. indexFromGroup() takes a zero-based group number.
            const uint index = indexFromGroup(group - 1);
            const auto type = syntax[index].first;

            // Skipped token kinds end the scan for this match. Continuing
            // would walk into the element's own inner capture groups and
            // attribute them to a different syntax element.
            if (type == Token::WHITESPACE || type == Token::COMMENT)
                break;
            if (type == Token::INVALID)
                throw Error(UNKNOWN_SYMBOL, std::to_string(pos));

            tokens.push_back({type, it->str(), pos});
            break;
        }
    }

    tokens.push_back({Token::LAST, "", static_cast<uint>(source.size())});
    // `tokens` is not a local variable, so the explicit move is what avoids
    // copying the whole vector into the return value.
    return std::move(tokens);
}
// Maps a zero-based capture group number of the combined regex to the index
// of the syntax element that owns that group.
//
// captureGroupCounts[i] is the number of consecutive groups belonging to
// element i, so element i owns the groups in
// [sum of counts before i, sum of counts up to and including i).
//
// group: zero-based capture group number (regex group number minus one).
// Returns the owning element's index, or captureGroupCounts.size() when
// `group` lies beyond the last known group.
uint Lexer::indexFromGroup(uint group) {
    uint groupsSoFar = 0;
    for (uint index = 0; index < captureGroupCounts.size(); ++index) {
        groupsSoFar += captureGroupCounts[index];
        // First element whose cumulative group count passes `group` owns it.
        // This also resolves an element's inner groups to that element, not
        // to the one after it.
        if (group < groupsSoFar)
            return index;
    }
    // Out of range: never read past the end of captureGroupCounts.
    return static_cast<uint>(captureGroupCounts.size());
}
OCA_END