Skip to content

Commit 19854fa

Browse files
committed
Add raw string interpolation support for cpp2
Raw-string literals that start with $ (dollar sign) will interpolate. That means that the following code: ```cpp rs := $R"(m["one"] + m["two"] = (m["one"] + m["two"])$)"; ``` will generate the following cpp1 code: ```cpp auto rs { R"(m["one"] + m["two"] = )" + cpp2::to_string(cpp2::assert_in_bounds(m, "one") + cpp2::assert_in_bounds(m, "two")) }; ``` It handles raw strings on a single line and across multiple lines. It processes the source line by line and stores the parts of a multiline raw string in a separate buffer (multiline_raw_strings).
1 parent 98e2127 commit 19854fa

File tree

2 files changed

+196
-19
lines changed

2 files changed

+196
-19
lines changed

source/common.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,13 @@ struct raw_string
214214
std::string text;
215215
std::string opening_seq;
216216
std::string closing_seq;
217+
bool should_interpolate = false;
218+
};
219+
220+
// Record of a finished raw string literal that spanned several source lines.
// The lexer appends one of these when it reaches the closing sequence of a
// multi-line raw string.
struct multiline_raw_string
221+
{
222+
// Accumulated text of the literal; the lexer points the resulting
// StringLiteral token at this buffer.
std::string text;
223+
// Line and column at which the literal ended. The lexer compares end.lineno
// with the current line and resumes scanning from end.colno when it has to
// re-process the line on which the literal ended.
source_position end = {0, 0};
217224
};
218225

219226
//-----------------------------------------------------------------------

source/lex.h

Lines changed: 189 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,71 @@ auto expand_string_literal(std::string_view text, std::vector<error>& errors, so
394394
return parts.generate();
395395
}
396396

397+
auto expand_raw_string_literal(
398+
const std::string& opening_seq,
399+
const std::string& closing_seq,
400+
string_parts::adds_sequences closing_strategy,
401+
std::string_view text,
402+
std::vector<error>& errors,
403+
source_position src_pos) -> string_parts
404+
{
405+
auto const length = std::ssize(text);
406+
auto pos = 0;
407+
auto first_quote_pos = pos;
408+
auto current_start = pos; // the current offset before which the string has been added to ret
409+
string_parts parts{opening_seq, closing_seq, closing_strategy};
410+
411+
// Now we're on the first character of the string itself
412+
for ( ; pos < length; ++pos )
413+
{
414+
// Find the next )$
415+
if (text[pos] == '$' && text[pos-1] == ')')
416+
{
417+
// Scan back to find the matching (
418+
auto paren_depth = 1;
419+
auto open = pos - 2;
420+
421+
for( ; open > current_start; --open)
422+
{
423+
if (text[open] == ')') {
424+
++paren_depth;
425+
}
426+
else if (text[open] == '(') {
427+
--paren_depth;
428+
if (paren_depth == 0) {
429+
break;
430+
}
431+
}
432+
}
433+
if (text[open] != '(')
434+
{
435+
errors.emplace_back(
436+
source_position( src_pos.lineno, src_pos.colno + pos ),
437+
"no matching ( for string interpolation ending in )$"
438+
);
439+
return parts;
440+
}
441+
442+
// 'open' is now at the matching (
443+
444+
// Put the next non-empty non-interpolated chunk straight into ret
445+
if (open != current_start) {
446+
parts.add_string(text.substr(current_start, open - current_start));
447+
}
448+
// Then put interpolated chunk into ret
449+
parts.add_code("cpp2::to_string" + std::string{text.substr(open, pos - open)});
450+
451+
current_start = pos+1;
452+
}
453+
}
454+
455+
// Put the final non-interpolated chunk straight into ret
456+
if (current_start < std::ssize(text)) {
457+
parts.add_string(text.substr(current_start));
458+
}
459+
460+
return parts;
461+
}
397462

398463
//-----------------------------------------------------------------------
399464
// lex: Tokenize a single line while maintaining inter-line state
@@ -413,6 +478,8 @@ auto expand_string_literal(std::string_view text, std::vector<error>& errors, so
413478
// -- this isn't about tokens generated later, that's tokens::generated_tokens
414479
static auto generated_text = std::deque<std::string>{};
415480

481+
//  Storage for finished multi-line raw string literals: lex_line appends an
//  entry when such a literal is closed and points the StringLiteral token at
//  the stored text, so the entries must outlive the tokens
//  NOTE(review): presumably a deque so that appending does not invalidate
//  pointers into earlier entries -- confirm
static auto multiline_raw_strings = std::deque<multiline_raw_string>{};
482+
416483
auto lex_line(
417484
std::string& mutable_line,
418485
int const lineno,
@@ -782,6 +849,45 @@ auto lex_line(
782849
return do_is_keyword(multi_keys);
783850
};
784851

852+
auto reset_processing_of_the_line = [&]() {
853+
// Redo processing of this whole line now that the string is expanded,
854+
// which may have moved it in memory... move i back to the line start
855+
// and discard any tokens we already tokenized for this line
856+
i = colno_t{-1};
857+
while (!tokens.empty() && tokens.back().position().lineno == lineno) {
858+
tokens.pop_back();
859+
}
860+
};
861+
862+
auto interpolate_raw_string = [&](
863+
const std::string& opening_seq,
864+
const std::string& closing_seq,
865+
string_parts::adds_sequences closing_strategy,
866+
std::string_view part,
867+
int pos_to_replace,
868+
int size_to_replace
869+
) -> bool {
870+
auto parts = expand_raw_string_literal(opening_seq, closing_seq, closing_strategy, part, errors, source_position(lineno, pos_to_replace + 1));
871+
auto new_part = parts.generate();
872+
mutable_line.replace( pos_to_replace, size_to_replace, new_part );
873+
i += std::ssize(new_part)-1;
874+
875+
if (parts.is_expanded()) {
876+
// raw string was expanded and we need to repeat the processing of this line
877+
reset_processing_of_the_line();
878+
879+
// but skipping end of potential multiline raw string that ends on this line
880+
if (!multiline_raw_strings.empty() && multiline_raw_strings.back().end.lineno == lineno) {
881+
i = multiline_raw_strings.back().end.colno;
882+
raw_string_multiline.reset();
883+
} else if (raw_string_multiline && raw_string_multiline->start.lineno == lineno) {
884+
raw_string_multiline.reset();
885+
}
886+
return true;
887+
}
888+
return false;
889+
};
890+
785891
//
786892
//-----------------------------------------------------
787893

@@ -811,6 +917,7 @@ auto lex_line(
811917
else if (peek1 == 'R' && peek2 == next) { return 3; } // LR"
812918
}
813919
else if (line[i] == 'R' && peek1 == next) { return 2; } // R"
920+
else if (line[i] == '$' && peek1 == 'R' && peek2 == next) { return 3; } // $R"
814921
return 0;
815922
};
816923

@@ -840,22 +947,36 @@ auto lex_line(
840947
auto end_pos = line.find(raw_string_multiline.value().closing_seq, i);
841948
auto part = line.substr(i, end_pos-i);
842949

950+
if (const auto& rsm = raw_string_multiline.value(); rsm.should_interpolate) {
951+
952+
auto closing_strategy = end_pos == line.npos ? string_parts::no_ends : string_parts::on_the_end;
953+
auto size_to_replace = end_pos == line.npos ? std::ssize(line) - i : end_pos - i + std::ssize(rsm.closing_seq);
954+
955+
if (interpolate_raw_string(rsm.opening_seq, rsm.closing_seq, closing_strategy, part, i, size_to_replace ) ) {
956+
continue;
957+
}
958+
}
959+
// raw string was not expanded
960+
843961
raw_string_multiline.value().text += part;
844962
if (end_pos == std::string::npos) {
845963
raw_string_multiline.value().text += '\n';
846964
break;
847965
}
848966

849-
// here we know that we are dealing with multiline raw string literal
850-
// token needs to use generated_text to store string that exists in multiple lines
851-
i = end_pos+std::ssize(raw_string_multiline.value().closing_seq)-1;
967+
// here we know that we are dealing with finalized multiline raw string literal
968+
// token needs to use multiline_raw_strings to store string that exists in multiple lines
852969
raw_string_multiline.value().text += raw_string_multiline.value().closing_seq;
853970

854-
generated_text.push_back(raw_string_multiline.value().text);
971+
// and position where multiline_raw_string ends (needed for reseting line parsing)
972+
i = end_pos+std::ssize(raw_string_multiline.value().closing_seq)-1;
973+
974+
const auto& text = raw_string_multiline.value().should_interpolate ? raw_string_multiline.value().text.substr(1) : raw_string_multiline.value().text;
975+
multiline_raw_strings.emplace_back(multiline_raw_string{ text, {lineno, i} });
855976

856977
tokens.push_back({
857-
&generated_text.back()[0],
858-
std::ssize(generated_text.back()),
978+
&multiline_raw_strings.back().text[0],
979+
std::ssize(multiline_raw_strings.back().text),
859980
raw_string_multiline.value().start,
860981
lexeme::StringLiteral
861982
});
@@ -1046,7 +1167,62 @@ auto lex_line(
10461167
store(1, lexeme::QuestionMark);
10471168

10481169
break;case '$':
1049-
store(1, lexeme::Dollar);
1170+
if (auto j = is_encoding_prefix_and('\"'); peek(j-2) == 'R') {
1171+
// if peek(j-2) is 'R' it means that we deal with raw-string literal
1172+
auto R_pos = i + j - 2;
1173+
auto seq_pos = i + j;
1174+
1175+
if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) {
1176+
auto opening_seq = line.substr(i, paren_pos - i + 1);
1177+
auto closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
1178+
1179+
if (auto closing_pos = line.find(closing_seq, paren_pos+1); closing_pos != line.npos) {
1180+
if (interpolate_raw_string(
1181+
opening_seq,
1182+
closing_seq,
1183+
string_parts::on_both_ends,
1184+
std::string_view(&line[paren_pos+1], closing_pos-paren_pos-1), i, closing_pos-i+std::ssize(closing_seq))
1185+
) {
1186+
continue;
1187+
}
1188+
1189+
tokens.push_back({
1190+
&line[R_pos],
1191+
i - R_pos + 1,
1192+
source_position(lineno, R_pos + 1),
1193+
lexeme::StringLiteral
1194+
});
1195+
} else {
1196+
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, opening_seq, opening_seq, closing_seq, true });
1197+
1198+
if (interpolate_raw_string(
1199+
opening_seq,
1200+
closing_seq,
1201+
string_parts::on_the_begining,
1202+
std::string_view(&line[paren_pos+1], std::ssize(line)-(paren_pos+1)), i, std::ssize(line)-i)
1203+
) {
1204+
continue;
1205+
}
1206+
// skip entire raw string opening sequence R"
1207+
i = paren_pos;
1208+
1209+
// if we are on the end of the line we need to add new line char
1210+
if (i+1 == std::ssize(line)) {
1211+
raw_string_multiline.value().text += '\n';
1212+
}
1213+
}
1214+
continue;
1215+
}
1216+
else {
1217+
errors.emplace_back(
1218+
source_position(lineno, i + j - 2),
1219+
"invalid new-line in raw string delimiter \"" + std::string(&line[i],j)
1220+
+ "\" - stray 'R' in program \""
1221+
);
1222+
}
1223+
} else {
1224+
store(1, lexeme::Dollar);
1225+
}
10501226

10511227
//G
10521228
//G literal:
@@ -1196,13 +1372,13 @@ auto lex_line(
11961372
auto seq_pos = i + j;
11971373

11981374
if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) {
1199-
auto raw_string_opening_seq = line.substr(i, paren_pos - i + 1);
1200-
auto raw_string_closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
1375+
auto opening_seq = line.substr(i, paren_pos - i + 1);
1376+
auto closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\"";
12011377

1202-
if (auto closing_pos = line.find(raw_string_closing_seq, paren_pos+1); closing_pos != line.npos) {
1203-
store(closing_pos+std::ssize(raw_string_closing_seq)-i, lexeme::StringLiteral);
1378+
if (auto closing_pos = line.find(closing_seq, paren_pos+1); closing_pos != line.npos) {
1379+
store(closing_pos+std::ssize(closing_seq)-i, lexeme::StringLiteral);
12041380
} else {
1205-
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, raw_string_opening_seq, raw_string_opening_seq, raw_string_closing_seq });
1381+
raw_string_multiline.emplace(raw_string{source_position{lineno, i}, opening_seq, opening_seq, closing_seq });
12061382
// skip entire raw string opening sequence R"
12071383
i = paren_pos;
12081384

@@ -1245,13 +1421,7 @@ auto lex_line(
12451421
assert(std::ssize(s) > j+1);
12461422
mutable_line.replace( i, j+1, s );
12471423

1248-
// Redo processing of this whole line now that the string is expanded,
1249-
// which may have moved it in memory... move i back to the line start
1250-
// and discard any tokens we already tokenized for this line
1251-
i = colno_t{-1};
1252-
while (!tokens.empty() && tokens.back().position().lineno == lineno) {
1253-
tokens.pop_back();
1254-
}
1424+
reset_processing_of_the_line();
12551425
}
12561426
}
12571427
}

0 commit comments

Comments
 (0)