Skip to content

Commit f84679d

Browse files
committed
Add support for raw string literals in cpp2
1 parent 4952334 commit f84679d

File tree

2 files changed

+122
-41
lines changed

2 files changed

+122
-41
lines changed

source/common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,9 @@ struct string_parts {
213213
struct raw_string
214214
{
215215
source_position start;
216-
source_position end;
217216
std::string text;
217+
std::string opening_seq;
218+
std::string closing_seq;
218219
};
219220

220221
//-----------------------------------------------------------------------

source/lex.h

Lines changed: 120 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,8 @@ auto lex_line(
463463
source_position& current_comment_start,
464464
std::vector<token>& tokens,
465465
std::vector<comment>& comments,
466-
std::vector<error>& errors
466+
std::vector<error>& errors,
467+
std::optional<raw_string>& raw_string_multiline
467468
)
468469
-> bool
469470
{
@@ -901,11 +902,22 @@ auto lex_line(
901902
//G 'u8' 'u'
902903
//G
903904
auto is_encoding_prefix_and = [&](char next) {
904-
if (line[i] == next) { return 1; }
905+
if (line[i] == next) { return 1; } // "
905906
else if (line[i] == 'u') {
906-
if (peek1 == next) { return 2; }
907-
else if (peek1 == '8' && peek2 == next) { return 3; }
908-
}
907+
if (peek1 == next) { return 2; } // u"
908+
else if (peek1 == '8' && peek2 == next) { return 3; } // u8"
909+
else if (peek1 == 'R' && peek2 == next) { return 3; } // uR"
910+
else if (peek1 == '8' && peek2 == 'R' && peek3 == next) { return 4; } // u8R"
911+
}
912+
else if (line[i] == 'U') {
913+
if ( peek1 == next) { return 2; } // U"
914+
else if (peek1 == 'R' && peek2 == next) { return 3; } // UR"
915+
}
916+
else if (line[i] == 'L') {
917+
if ( peek1 == next ) { return 2; } // L"
918+
else if (peek1 == 'R' && peek2 == next) { return 3; } // LR"
919+
}
920+
else if (line[i] == 'R' && peek1 == next) { return 2; } // R"
909921
return 0;
910922
};
911923

@@ -931,6 +943,32 @@ auto lex_line(
931943
current_comment += line[i];
932944
}
933945
}
946+
else if (raw_string_multiline) {
947+
auto end_pos = line.find(raw_string_multiline.value().closing_seq, i);
948+
auto part = line.substr(i, end_pos-i);
949+
950+
raw_string_multiline.value().text += part;
951+
if (end_pos == std::string::npos) {
952+
raw_string_multiline.value().text += '\n';
953+
break;
954+
}
955+
956+
// At this point the closing sequence of a multi-line raw string literal was found on this line.
957+
// The accumulated text spans several source lines, so it is stored in generated_text and the token refers to that copy.
958+
i = end_pos+std::ssize(raw_string_multiline.value().closing_seq)-1;
959+
raw_string_multiline.value().text += raw_string_multiline.value().closing_seq;
960+
961+
generated_text.push_back(raw_string_multiline.value().text);
962+
963+
tokens.push_back({
964+
&generated_text.back()[0],
965+
std::ssize(generated_text.back()),
966+
raw_string_multiline.value().start,
967+
lexeme::StringLiteral
968+
});
969+
raw_string_multiline.reset();
970+
continue;
971+
}
934972

935973
// Otherwise, we will be at the start of a token, a comment, or whitespace
936974
//
@@ -1257,54 +1295,90 @@ auto lex_line(
12571295

12581296
//G string-literal:
12591297
//G encoding-prefix? '"' s-char-seq? '"'
1298+
//G encoding-prefix? 'R"' d-char-seq? '(' s-char-seq? ')' d-char-seq? '"'
12601299
//G
12611300
//G s-char-seq:
12621301
//G interpolation? s-char
12631302
//G interpolation? s-char-seq s-char
12641303
//G
1304+
//G d-char-seq:
1305+
//G d-char
1306+
//G
12651307
//G interpolation:
12661308
//G '(' expression ')' '$'
12671309
//G
12681310
else if (auto j = is_encoding_prefix_and('\"')) {
1269-
while (auto len = peek_is_sc_char(j, '\"')) { j += len; }
1270-
if (peek(j) != '\"') {
1271-
errors.emplace_back(
1272-
source_position(lineno, i),
1273-
"string literal \"" + std::string(&line[i+1],j)
1274-
+ "\" is missing its closing \""
1275-
);
1276-
}
1277-
1278-
// At this point we have a string-literal, but it may contain
1279-
// captures/interpolations we want to tokenize
1280-
auto literal = std::string_view{ &line[i], std::size_t(j+1) };
1281-
auto s = expand_string_literal( literal, errors, source_position(lineno, i + 1) );
1282-
1283-
// If there are no captures/interpolations, just store it directly and continue
1284-
if (std::ssize(s) == j+1) {
1285-
store(j+1, lexeme::StringLiteral);
1311+
// If peek(j-2) is 'R', the character just before the opening quote is 'R', so this is a raw string literal
1312+
if (peek(j-2) == 'R') {
1313+
auto seq_pos = i + j;
1314+
1315+
if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) {
1316+
auto raw_string_opening_seq = line.substr(i, paren_pos - i + 1);
1317+
auto raw_string_closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
1318+
1319+
if (auto closing_pos = line.find(raw_string_closing_seq, paren_pos+1); closing_pos != line.npos) {
1320+
store(closing_pos+std::ssize(raw_string_closing_seq)-i, lexeme::StringLiteral);
1321+
} else {
1322+
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, raw_string_opening_seq, raw_string_opening_seq, raw_string_closing_seq });
1323+
// Advance i to the '(' that ends the raw string opening sequence (encoding prefix, R", and delimiter)
1324+
i = paren_pos;
1325+
1326+
// If the '(' is the last character on this line, append a newline character to the literal's text
1327+
if (i+1 == std::ssize(line)) {
1328+
raw_string_multiline.value().text += '\n';
1329+
}
1330+
}
1331+
continue;
1332+
}
1333+
else {
1334+
errors.emplace_back(
1335+
source_position(lineno, i + j - 2),
1336+
"invalid new-line in raw string delimiter \"" + std::string(&line[i],j)
1337+
+ "\" - stray 'R' in program \""
1338+
);
1339+
}
12861340
}
1287-
// Otherwise, replace it with the expanded version and continue
12881341
else {
1289-
if (std::ssize(s) <= j + 1) {
1342+
while (auto len = peek_is_sc_char(j, '\"')) { j += len; }
1343+
if (peek(j) != '\"') {
12901344
errors.emplace_back(
1291-
source_position( lineno, i ),
1292-
"not a legal string literal"
1345+
source_position(lineno, i),
1346+
"string literal \"" + std::string(&line[i+1],j)
1347+
+ "\" is missing its closing \""
12931348
);
1294-
return {};
12951349
}
1296-
mutable_line.replace( i, j+1, s );
1297-
1298-
// Redo processing of this whole line now that the string is expanded,
1299-
// which may have moved it in memory... move i back to the line start
1300-
// and discard any tokens we already tokenized for this line
1301-
i = colno_t{-1};
1302-
while (
1303-
!tokens.empty()
1304-
&& tokens.back().position().lineno == lineno
1305-
)
1306-
{
1307-
tokens.pop_back();
1350+
1351+
// At this point we have a string-literal, but it may contain
1352+
// captures/interpolations we want to tokenize
1353+
auto literal = std::string_view{ &line[i], std::size_t(j+1) };
1354+
auto s = expand_string_literal( literal, errors, source_position(lineno, i + 1) );
1355+
1356+
// If there are no captures/interpolations, just store it directly and continue
1357+
if (std::ssize(s) == j+1) {
1358+
store(j+1, lexeme::StringLiteral);
1359+
}
1360+
// Otherwise, replace it with the expanded version and continue
1361+
else {
1362+
if (std::ssize(s) <= j + 1) {
1363+
errors.emplace_back(
1364+
source_position( lineno, i ),
1365+
"not a legal string literal"
1366+
);
1367+
return {};
1368+
}
1369+
mutable_line.replace( i, j+1, s );
1370+
1371+
// Redo processing of this whole line now that the string is expanded,
1372+
// which may have moved it in memory... move i back to the line start
1373+
// and discard any tokens we already tokenized for this line
1374+
i = colno_t{-1};
1375+
while (
1376+
!tokens.empty()
1377+
&& tokens.back().position().lineno == lineno
1378+
)
1379+
{
1380+
tokens.pop_back();
1381+
}
13081382
}
13091383
}
13101384
}
@@ -1411,6 +1485,9 @@ auto lex_line(
14111485
if (in_comment) {
14121486
current_comment += "\n";
14131487
}
1488+
if (raw_string_multiline && line.size() == 0) {
1489+
raw_string_multiline.value().text += '\n';
1490+
}
14141491

14151492
assert (std::ssize(tokens) >= original_size);
14161493
return std::ssize(tokens) != original_size;
@@ -1467,6 +1544,8 @@ class tokens
14671544
-> void
14681545
{
14691546
auto in_comment = false;
1547+
auto raw_string_multiline = std::optional<raw_string>();
1548+
14701549
assert (std::ssize(lines) > 0);
14711550
auto line = std::begin(lines)+1;
14721551
while (line != std::end(lines)) {
@@ -1495,7 +1574,8 @@ class tokens
14951574
lex_line(
14961575
line->text, lineno,
14971576
in_comment, current_comment, current_comment_start,
1498-
entry, comments, errors
1577+
entry, comments, errors,
1578+
raw_string_multiline
14991579
);
15001580
}
15011581

0 commit comments

Comments
 (0)