@@ -463,7 +463,8 @@ auto lex_line(
     source_position& current_comment_start,
     std::vector<token>& tokens,
     std::vector<comment>& comments,
-    std::vector<error>& errors
+    std::vector<error>& errors,
+    std::optional<raw_string>& raw_string_multiline
 )
     -> bool
 {
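Note: the new out-parameter carries in-progress raw-string state across calls to lex_line. The raw_string type itself is not defined in this excerpt; judging only from how it is constructed and used in the hunks below, a minimal sketch of it would look roughly like this (member names and layout inferred from usage, not confirmed by the diff):

    struct raw_string
    {
        source_position start;        // where the literal began (line, column of the prefix)
        std::string     opening_seq;  // e.g. R"xyz(
        std::string     text;         // accumulated literal text, seeded with opening_seq
        std::string     closing_seq;  // e.g. )xyz"
    };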
@@ -901,11 +902,22 @@ auto lex_line(
         //G     'u8' 'u'
         //G
         auto is_encoding_prefix_and = [&](char next) {
-            if (line[i] == next) { return 1; }
+            if (line[i] == next) { return 1; }                                          // "
             else if (line[i] == 'u') {
-                if (peek1 == next) { return 2; }
-                else if (peek1 == '8' && peek2 == next) { return 3; }
-            }
+                if (peek1 == next) { return 2; }                                        // u"
+                else if (peek1 == '8' && peek2 == next) { return 3; }                   // u8"
+                else if (peek1 == 'R' && peek2 == next) { return 3; }                   // uR"
+                else if (peek1 == '8' && peek2 == 'R' && peek3 == next) { return 4; }   // u8R"
+            }
+            else if (line[i] == 'U') {
+                if (peek1 == next) { return 2; }                                        // U"
+                else if (peek1 == 'R' && peek2 == next) { return 3; }                   // UR"
+            }
+            else if (line[i] == 'L') {
+                if (peek1 == next) { return 2; }                                        // L"
+                else if (peek1 == 'R' && peek2 == next) { return 3; }                   // LR"
+            }
+            else if (line[i] == 'R' && peek1 == next) { return 2; }                     // R"
             return 0;
         };
 
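For reference, the extended lambda now covers all the C++-style encoding prefixes, including the raw-string forms. For a call like is_encoding_prefix_and('"'), the return value is the number of characters consumed through the opening quote (0 means no match at the current position); roughly:

    "     -> 1      u"   -> 2      u8"  -> 3      uR"  -> 3      u8R" -> 4
    U"    -> 2      UR"  -> 3      L"   -> 2      LR"  -> 3      R"   -> 2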
@@ -931,6 +943,32 @@ auto lex_line(
                 current_comment += line[i];
             }
         }
+        else if (raw_string_multiline) {
+            auto end_pos = line.find(raw_string_multiline.value().closing_seq, i);
+            auto part = line.substr(i, end_pos-i);
+
+            raw_string_multiline.value().text += part;
+            if (end_pos == std::string::npos) {
+                raw_string_multiline.value().text += '\n';
+                break;
+            }
+
+            //  Here we know we are dealing with a multiline raw string literal;
+            //  the token needs to use generated_text to store text that spans multiple lines
+            i = end_pos + std::ssize(raw_string_multiline.value().closing_seq) - 1;
+            raw_string_multiline.value().text += raw_string_multiline.value().closing_seq;
+
+            generated_text.push_back(raw_string_multiline.value().text);
+
+            tokens.push_back({
+                &generated_text.back()[0],
+                std::ssize(generated_text.back()),
+                raw_string_multiline.value().start,
+                lexeme::StringLiteral
+            });
+            raw_string_multiline.reset();
+            continue;
+        }
 
         //  Otherwise, we will be at the start of a token, a comment, or whitespace
         //
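The effect of this branch is easiest to see on a concrete input. With a hypothetical two-line Cpp2 source (not from the commit) such as:

    str: std::string = R"(first line
    second line)";

the first call to lex_line parks R"(first line plus a newline in raw_string_multiline (see the string-literal hunk below); on the second line this branch finds the closing sequence )", appends the remainder and the closing delimiter, copies the completed text into generated_text (which presumably provides stable storage for token characters that no longer live in any single source line), and emits one StringLiteral token positioned at the opening R".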
@@ -1257,54 +1295,90 @@ auto lex_line(
 
         //G string-literal:
         //G     encoding-prefix? '"' s-char-seq? '"'
+        //G     encoding-prefix? 'R"' d-char-seq? '(' s-char-seq? ')' d-char-seq? '"'
         //G
         //G s-char-seq:
         //G     interpolation? s-char
         //G     interpolation? s-char-seq s-char
         //G
+        //G d-char-seq:
+        //G     d-char
+        //G
         //G interpolation:
         //G     '(' expression ')' '$'
         //G
         else if (auto j = is_encoding_prefix_and('\"')) {
-            while (auto len = peek_is_sc_char(j, '\"')) { j += len; }
-            if (peek(j) != '\"') {
-                errors.emplace_back(
-                    source_position(lineno, i),
-                    "string literal \"" + std::string(&line[i+1],j)
-                        + "\" is missing its closing \""
-                );
-            }
-
-            //  At this point we have a string-literal, but it may contain
-            //  captures/interpolations we want to tokenize
-            auto literal = std::string_view{ &line[i], std::size_t(j+1) };
-            auto s = expand_string_literal( literal, errors, source_position(lineno, i + 1) );
-
-            //  If there are no captures/interpolations, just store it directly and continue
-            if (std::ssize(s) == j+1) {
-                store(j+1, lexeme::StringLiteral);
+            //  If peek(j-2) is 'R', we are dealing with a raw string literal
+            if (peek(j-2) == 'R') {
+                auto seq_pos = i + j;
+
+                if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) {
+                    auto raw_string_opening_seq = line.substr(i, paren_pos - i + 1);
+                    auto raw_string_closing_seq = ")" + line.substr(seq_pos, paren_pos-seq_pos) + "\"";
+
+                    if (auto closing_pos = line.find(raw_string_closing_seq, paren_pos+1); closing_pos != line.npos) {
+                        store(closing_pos + std::ssize(raw_string_closing_seq) - i, lexeme::StringLiteral);
+                    } else {
+                        raw_string_multiline.emplace(raw_string{ source_position{lineno, i}, raw_string_opening_seq, raw_string_opening_seq, raw_string_closing_seq });
+                        //  skip the entire raw string opening sequence R"
+                        i = paren_pos;
+
+                        //  if we are at the end of the line we need to add a newline char
+                        if (i+1 == std::ssize(line)) {
+                            raw_string_multiline.value().text += '\n';
+                        }
+                    }
+                    continue;
+                }
+                else {
+                    errors.emplace_back(
+                        source_position(lineno, i + j - 2),
+                        "invalid new-line in raw string delimiter \"" + std::string(&line[i],j)
+                            + "\" - stray 'R' in program \""
+                    );
+                }
             }
-            //  Otherwise, replace it with the expanded version and continue
             else {
-                if (std::ssize(s) <= j + 1) {
+                while (auto len = peek_is_sc_char(j, '\"')) { j += len; }
+                if (peek(j) != '\"') {
                     errors.emplace_back(
-                        source_position( lineno, i ),
-                        "not a legal string literal"
+                        source_position(lineno, i),
+                        "string literal \"" + std::string(&line[i+1],j)
+                            + "\" is missing its closing \""
                     );
-                    return {};
                 }
-                mutable_line.replace( i, j+1, s );
-
-                //  Redo processing of this whole line now that the string is expanded,
-                //  which may have moved it in memory... move i back to the line start
-                //  and discard any tokens we already tokenized for this line
-                i = colno_t{-1};
-                while (
-                    !tokens.empty()
-                    && tokens.back().position().lineno == lineno
-                )
-                {
-                    tokens.pop_back();
+
+                //  At this point we have a string-literal, but it may contain
+                //  captures/interpolations we want to tokenize
+                auto literal = std::string_view{ &line[i], std::size_t(j+1) };
+                auto s = expand_string_literal( literal, errors, source_position(lineno, i + 1) );
+
+                //  If there are no captures/interpolations, just store it directly and continue
+                if (std::ssize(s) == j+1) {
+                    store(j+1, lexeme::StringLiteral);
+                }
+                //  Otherwise, replace it with the expanded version and continue
+                else {
+                    if (std::ssize(s) <= j + 1) {
+                        errors.emplace_back(
+                            source_position( lineno, i ),
+                            "not a legal string literal"
+                        );
+                        return {};
+                    }
+                    mutable_line.replace( i, j+1, s );
+
+                    //  Redo processing of this whole line now that the string is expanded,
+                    //  which may have moved it in memory... move i back to the line start
+                    //  and discard any tokens we already tokenized for this line
+                    i = colno_t{-1};
+                    while (
+                        !tokens.empty()
+                        && tokens.back().position().lineno == lineno
+                    )
+                    {
+                        tokens.pop_back();
+                    }
                 }
             }
         }
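To make the delimiter bookkeeping concrete, consider a hypothetical single-line literal (example values assumed for illustration only):

    //  line contains:            x: std::string = R"xyz(hello)xyz";
    //  i points at 'R', j == 2, so seq_pos = i + 2 (start of the d-char-seq "xyz")
    //  raw_string_opening_seq =  R"xyz(
    //  raw_string_closing_seq =  )xyz"
    //  closing sequence found on the same line -> store(...) emits one StringLiteral token

If the closing sequence is not found on the same line, the state is parked in raw_string_multiline (with text seeded to the opening sequence) and the continuation branch earlier in lex_line finishes the literal on a later line.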
@@ -1411,6 +1485,9 @@ auto lex_line(
     if (in_comment) {
         current_comment += "\n";
     }
+    if (raw_string_multiline && line.size() == 0) {
+        raw_string_multiline.value().text += '\n';
+    }
 
     assert(std::ssize(tokens) >= original_size);
     return std::ssize(tokens) != original_size;
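This covers interior lines of a multiline raw string that are completely empty: the character loop body never executes for them, so the newline has to be appended here. For instance, in a hypothetical literal like

    s: std::string = R"(first

    third)";

the empty middle line presumably reaches lex_line with line.size() == 0 and contributes only its '\n' to the accumulated text.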
@@ -1467,6 +1544,8 @@ class tokens
         -> void
     {
         auto in_comment = false;
+        auto raw_string_multiline = std::optional<raw_string>();
+
         assert(std::ssize(lines) > 0);
         auto line = std::begin(lines)+1;
         while (line != std::end(lines)) {
@@ -1495,7 +1574,8 @@ class tokens
                 lex_line(
                     line->text, lineno,
                     in_comment, current_comment, current_comment_start,
-                    entry, comments, errors
+                    entry, comments, errors,
+                    raw_string_multiline
                 );
             }