diff --git a/regression-tests/pure2-raw-string-literal-and-interpolation.cpp2 b/regression-tests/pure2-raw-string-literal-and-interpolation.cpp2 new file mode 100644 index 0000000000..ddddae5178 --- /dev/null +++ b/regression-tests/pure2-raw-string-literal-and-interpolation.cpp2 @@ -0,0 +1,34 @@ +main: () -> int = { + i := 42; + m : std::map = (); + m["one"] = 1; + m["two"] = 2; + + str : std::string = "this is a string"; + + raw_str : std::string = R"string(raw string without interpolation)string"; + + raw_str_multi : std::string = R"test(this is raw string literal + +that can last for multiple + +lines)test"; + + raw_str_inter : std::string = $R"test(this is raw string literal +that can last for multiple +lines +(i)$ R"(this can be added too)" +calculations like m["one"] + m["two"] = (m["one"] + m["two"])$ also works +("at the beginning of the line")$!!!)test"; + + raw_str_inter_multi : std::string = $R"( + + )" + $R"((i)$)" + $R"((i)$)"; + + std::cout << str << std::endl; + std::cout << raw_str << std::endl; + std::cout << raw_str_multi << std::endl; + std::cout << raw_str_inter << std::endl; + std::cout << raw_str_inter_multi << std::endl; + std::cout << ($R"((m["one"])$.)" + $R"((m["two"])$.)" + $R"((m["three"])$.)" + $R"((i)$)") << std::endl; +} diff --git a/regression-tests/test-results/apple-clang-14/pure2-raw-string-literal-and-interpolation.cpp.execution b/regression-tests/test-results/apple-clang-14/pure2-raw-string-literal-and-interpolation.cpp.execution new file mode 100644 index 0000000000..c48e2bd603 --- /dev/null +++ b/regression-tests/test-results/apple-clang-14/pure2-raw-string-literal-and-interpolation.cpp.execution @@ -0,0 +1,17 @@ +this is a string +raw string without interpolation +this is raw string literal + +that can last for multiple + +lines +this is raw string literal +that can last for multiple +lines +42 R"(this can be added too)" +calculations like m["one"] + m["two"] = 3 also works +at the beginning of the line!!! + + + 4242 +1.2.0.42 diff --git a/regression-tests/test-results/apple-clang-14/pure2-raw-string-literal-and-interpolation.cpp.output b/regression-tests/test-results/apple-clang-14/pure2-raw-string-literal-and-interpolation.cpp.output new file mode 100644 index 0000000000..e69de29bb2 diff --git a/regression-tests/test-results/clang-12/pure2-raw-string-literal-and-interpolation.cpp.execution b/regression-tests/test-results/clang-12/pure2-raw-string-literal-and-interpolation.cpp.execution new file mode 100644 index 0000000000..c48e2bd603 --- /dev/null +++ b/regression-tests/test-results/clang-12/pure2-raw-string-literal-and-interpolation.cpp.execution @@ -0,0 +1,17 @@ +this is a string +raw string without interpolation +this is raw string literal + +that can last for multiple + +lines +this is raw string literal +that can last for multiple +lines +42 R"(this can be added too)" +calculations like m["one"] + m["two"] = 3 also works +at the beginning of the line!!! + + + 4242 +1.2.0.42 diff --git a/regression-tests/test-results/clang-12/pure2-raw-string-literal-and-interpolation.cpp.output b/regression-tests/test-results/clang-12/pure2-raw-string-literal-and-interpolation.cpp.output new file mode 100644 index 0000000000..e69de29bb2 diff --git a/regression-tests/test-results/pure2-raw-string-literal-and-interpolation.cpp b/regression-tests/test-results/pure2-raw-string-literal-and-interpolation.cpp new file mode 100644 index 0000000000..d668278f21 --- /dev/null +++ b/regression-tests/test-results/pure2-raw-string-literal-and-interpolation.cpp @@ -0,0 +1,47 @@ + +#define CPP2_USE_MODULES Yes + +#include "cpp2util.h" + + +#line 1 "pure2-raw-string-literal-and-interpolation.cpp2" +[[nodiscard]] auto main() -> int; + +//=== Cpp2 definitions ========================================================== + +#line 1 "pure2-raw-string-literal-and-interpolation.cpp2" +[[nodiscard]] auto main() -> int{ + auto i {42}; + std::map m {}; + cpp2::assert_in_bounds(m, "one") = 1; + cpp2::assert_in_bounds(m, "two") = 2; + + std::string str {"this is a string"}; + + std::string raw_str {R"string(raw string without interpolation)string"}; + + std::string raw_str_multi {R"test(this is raw string literal + +that can last for multiple + +lines)test"}; + + std::string raw_str_inter {R"test(this is raw string literal +that can last for multiple +lines +)test" + cpp2::to_string(i) + R"test( R"(this can be added too)" +calculations like m["one"] + m["two"] = )test" + cpp2::to_string(cpp2::assert_in_bounds(m, "one") + cpp2::assert_in_bounds(m, "two")) + R"test( also works +)test" + cpp2::to_string("at the beginning of the line") + R"test(!!!)test"}; + + std::string raw_str_inter_multi {R"( + + )" + cpp2::to_string(i) + cpp2::to_string(i)}; + + std::cout << std::move(str) << std::endl; + std::cout << std::move(raw_str) << std::endl; + std::cout << std::move(raw_str_multi) << std::endl; + std::cout << std::move(raw_str_inter) << std::endl; + std::cout << std::move(raw_str_inter_multi) << std::endl; + std::cout << (cpp2::to_string(cpp2::assert_in_bounds(m, "one")) + R"(.)" + cpp2::to_string(cpp2::assert_in_bounds(m, "two")) + R"(.)" + cpp2::to_string(cpp2::assert_in_bounds(std::move(m), "three")) + R"(.)" + cpp2::to_string(std::move(i))) << std::endl; +} + diff --git a/regression-tests/test-results/pure2-raw-string-literal-and-interpolation.cpp2.output b/regression-tests/test-results/pure2-raw-string-literal-and-interpolation.cpp2.output new file mode 100644 index 0000000000..b264eb8099 --- /dev/null +++ b/regression-tests/test-results/pure2-raw-string-literal-and-interpolation.cpp2.output @@ -0,0 +1,2 @@ +pure2-raw-string-literal-and-interpolation.cpp2... ok (all Cpp2, passes safety checks) + diff --git a/source/common.h b/source/common.h index ec051c359e..e1bcf2b148 100644 --- a/source/common.h +++ b/source/common.h @@ -100,6 +100,131 @@ struct comment std::string text; }; +struct string_parts { + struct cpp_code { std::string text; }; + struct raw_string { std::string text; }; + enum adds_sequences { no_ends = 0, on_the_begining = 1, on_the_end = 2, on_both_ends = 3 }; + + string_parts(const std::string& beginseq, + const std::string& endseq, + adds_sequences strateg) + : begin_seq{beginseq} + , end_seq{endseq} + , strategy{strateg} + { + if (!(strategy & on_the_begining)) { + parts.push_back(raw_string{""}); + } + } + + void add_code(const std::string& text) { parts.push_back(cpp_code{text});} + void add_string(const std::string& text) { parts.push_back(raw_string{text});} + void add_string(const std::string_view& text) { parts.push_back(raw_string{std::string(text)});} + + void clear() { parts.clear(); } + + + auto generate() const -> std::string { + + if (parts.empty()) { + return (strategy & on_the_begining ? begin_seq : std::string{}) + + (strategy & on_the_end ? end_seq : std::string{}); + } + + auto result = std::visit(begin_visit{begin_seq, strategy}, + parts.front()); + + if (std::ssize(parts) > 1) { + auto it1 = parts.cbegin(); + auto it2 = parts.cbegin()+1; + for(;it2 != parts.cend(); ++it1, ++it2) { + result += std::visit(generator_visit{begin_seq, end_seq}, *it1, *it2); + } + } + + if (!(strategy & on_the_end)) { + result += std::visit([this](const auto& lhs) { + return generator_visit{begin_seq, end_seq}(lhs, raw_string{""}); + }, parts.back()); + } + + result += std::visit(end_visit{end_seq, strategy}, parts.back()); + + return result; + } + + auto is_expanded() const -> bool { + for (const auto& p : parts) { + if (std::holds_alternative(p)) { + return true; + } + } + return false; + } + +private: + std::string begin_seq; + std::string end_seq; + adds_sequences strategy; + std::vector> parts; + + struct begin_visit { + std::string begin_seq; + adds_sequences strategy; + + auto operator()(const raw_string& part) const -> std::string { + return (strategy & on_the_begining ? begin_seq : "") + part.text; + } + auto operator()(const cpp_code& part) const -> std::string { + return part.text; + } + }; + + struct end_visit { + std::string end_seq; + adds_sequences strategy; + auto operator()(const raw_string&) const -> std::string { + return strategy & on_the_end ? end_seq : ""; + } + auto operator()(const cpp_code&) const -> std::string { + return {}; + } + }; + + struct generator_visit { + std::string begin_seq; + std::string end_seq; + + auto operator()(const raw_string&, const cpp_code& part ) const -> std::string { + return end_seq + " + " + part.text; + } + auto operator()(const cpp_code&, const raw_string& part ) const -> std::string { + return " + " + begin_seq + part.text; + } + auto operator()(const raw_string&, const raw_string& part ) const -> std::string { + return part.text; + } + auto operator()(const cpp_code&, const cpp_code& part ) const -> std::string { + return " + " + part.text; + } + }; +}; + +struct raw_string +{ + source_position start; + std::string text; + std::string opening_seq; + std::string closing_seq; + bool should_interpolate = false; +}; + +struct multiline_raw_string +{ + std::string text; + source_position end = {0, 0}; +}; + //----------------------------------------------------------------------- // // error: represents a user-readable error message diff --git a/source/lex.h b/source/lex.h index 70b180d86d..2f706794e7 100644 --- a/source/lex.h +++ b/source/lex.h @@ -349,7 +349,6 @@ auto expand_string_literal( } auto pos = 0; - auto ret = std::string{}; // the return string we're going to build // Skip prefix to first non-" character while ( @@ -363,28 +362,10 @@ auto expand_string_literal( pos < length && text[pos] == '"' ); - auto first_quote_pos = pos; ++pos; auto current_start = pos; // the current offset before which the string has been added to ret - auto first = true; - auto add_plus = [&](bool and_quote = false) { - // Only add a "+" after the first term - if (!first) { - ret += " + "; - } - // If also adding a quote, - if (and_quote) { - // remember to pull in the prefix on the first term (e.g., u") - if (first) { - ret += text.substr(0, first_quote_pos+1); - } - else { - ret += '"'; - } - } - first = false; - }; + auto parts = string_parts{"\"", "\"", string_parts::on_both_ends}; // Now we're on the first character of the string itself for ( @@ -428,17 +409,13 @@ auto expand_string_literal( // Put the next non-empty non-interpolated chunk straight into ret if (open != current_start) { - add_plus(true); - ret += text.substr(current_start, open - current_start); - ret += '"'; + parts.add_string(text.substr(current_start, open - current_start)); } // Then put interpolated chunk into ret - add_plus(); - ret += "cpp2::to_string"; auto chunk = std::string{text.substr(open, pos - open)}; replace_all(chunk, "\\\"", "\""); - ret += chunk; + parts.add_code("cpp2::to_string" + chunk); current_start = pos+1; } @@ -451,18 +428,77 @@ auto expand_string_literal( ); // Put the final non-interpolated chunk straight into ret - if ( - first - || current_start < std::ssize(text)-1 - ) - { - add_plus(true); - ret += text.substr(current_start); + if (current_start < std::ssize(text)-1) { + parts.add_string(text.substr(current_start, std::ssize(text)-current_start-1)); } - return ret; + return parts.generate(); } +auto expand_raw_string_literal( + const std::string& opening_seq, + const std::string& closing_seq, + string_parts::adds_sequences closing_strategy, + std::string_view text, + std::vector& errors, + source_position src_pos) -> string_parts +{ + auto const length = std::ssize(text); + auto pos = 0; + auto current_start = pos; // the current offset before which the string has been added to ret + string_parts parts{opening_seq, closing_seq, closing_strategy}; + + // Now we're on the first character of the string itself + for ( ; pos < length; ++pos ) + { + // Find the next )$ + if (text[pos] == '$' && text[pos-1] == ')') + { + // Scan back to find the matching ( + auto paren_depth = 1; + auto open = pos - 2; + + for( ; open > current_start; --open) + { + if (text[open] == ')') { + ++paren_depth; + } + else if (text[open] == '(') { + --paren_depth; + if (paren_depth == 0) { + break; + } + } + } + if (text[open] != '(') + { + errors.emplace_back( + source_position( src_pos.lineno, src_pos.colno + pos ), + "no matching ( for string interpolation ending in )$" + ); + return parts; + } + + // 'open' is now at the matching ( + + // Put the next non-empty non-interpolated chunk straight into ret + if (open != current_start) { + parts.add_string(text.substr(current_start, open - current_start)); + } + // Then put interpolated chunk into ret + parts.add_code("cpp2::to_string" + std::string{text.substr(open, pos - open)}); + + current_start = pos+1; + } + } + + // Put the final non-interpolated chunk straight into ret + if (current_start < std::ssize(text)) { + parts.add_string(text.substr(current_start)); + } + + return parts; +} //----------------------------------------------------------------------- // lex: Tokenize a single line while maintaining inter-line state @@ -482,6 +518,8 @@ auto expand_string_literal( // -- this isn't about tokens generated later, that's tokens::generated_tokens static auto generated_text = std::deque{}; +static auto multiline_raw_strings = std::deque{}; + auto lex_line( std::string& mutable_line, int const lineno, @@ -490,7 +528,8 @@ auto lex_line( source_position& current_comment_start, std::vector& tokens, std::vector& comments, - std::vector& errors + std::vector& errors, + std::optional& raw_string_multiline ) -> bool { @@ -915,6 +954,49 @@ auto lex_line( return do_is_keyword(multi_keys); }; + auto reset_processing_of_the_line = [&]() { + // Redo processing of this whole line now that the string is expanded, + // which may have moved it in memory... move i back to the line start + // and discard any tokens we already tokenized for this line + i = colno_t{-1}; + while ( + !tokens.empty() + && tokens.back().position().lineno == lineno + ) + { + tokens.pop_back(); + } + }; + + auto interpolate_raw_string = [&]( + const std::string& opening_seq, + const std::string& closing_seq, + string_parts::adds_sequences closing_strategy, + std::string_view part, + int pos_to_replace, + int size_to_replace + ) -> bool { + auto parts = expand_raw_string_literal(opening_seq, closing_seq, closing_strategy, part, errors, source_position(lineno, pos_to_replace + 1)); + auto new_part = parts.generate(); + mutable_line.replace( pos_to_replace, size_to_replace, new_part ); + i += std::ssize(new_part)-1; + + if (parts.is_expanded()) { + // raw string was expanded and we need to repeat the processing of this line + reset_processing_of_the_line(); + + // but skipping end of potential multiline raw string that ends on this line + if (!multiline_raw_strings.empty() && multiline_raw_strings.back().end.lineno == lineno) { + i = multiline_raw_strings.back().end.colno; + raw_string_multiline.reset(); + } else if (raw_string_multiline && raw_string_multiline->start.lineno == lineno) { + raw_string_multiline.reset(); + } + return true; + } + return false; + }; + // //----------------------------------------------------- @@ -925,14 +1007,25 @@ auto lex_line( auto peek3 = peek(3); //G encoding-prefix: one of - //G 'u8' 'u' + //G 'u8' 'u' 'uR' 'u8R' 'U' 'UR' 'L' 'LR' 'R' //G auto is_encoding_prefix_and = [&](char next) { - if (line[i] == next) { return 1; } + if (line[i] == next) { return 1; } // " else if (line[i] == 'u') { - if (peek1 == next) { return 2; } - else if (peek1 == '8' && peek2 == next) { return 3; } - } + if (peek1 == next) { return 2; } // u" + else if (peek1 == '8' && peek2 == next) { return 3; } // u8" + else if (peek1 == 'R' && peek2 == next) { return 3; } // uR" + else if (peek1 == '8' && peek2 == 'R' && peek3 == next) { return 4; } // u8R" + } + else if (line[i] == 'U') { + if ( peek1 == next) { return 2; } // U" + else if (peek1 == 'R' && peek2 == next) { return 3; } // UR" + } + else if (line[i] == 'L') { + if ( peek1 == next ) { return 2; } // L" + else if (peek1 == 'R' && peek2 == next) { return 3; } // LR" + } + else if (line[i] == 'R' && peek1 == next) { return 2; } // R" return 0; }; @@ -958,6 +1051,46 @@ auto lex_line( current_comment += line[i]; } } + else if (raw_string_multiline) { + auto end_pos = line.find(raw_string_multiline.value().closing_seq, i); + auto part = line.substr(i, end_pos-i); + + if (const auto& rsm = raw_string_multiline.value(); rsm.should_interpolate) { + + auto closing_strategy = end_pos == line.npos ? string_parts::no_ends : string_parts::on_the_end; + auto size_to_replace = end_pos == line.npos ? std::ssize(line) - i : end_pos - i + std::ssize(rsm.closing_seq); + + if (interpolate_raw_string(rsm.opening_seq, rsm.closing_seq, closing_strategy, part, i, size_to_replace ) ) { + continue; + } + } + // raw string was not expanded + + raw_string_multiline.value().text += part; + if (end_pos == std::string::npos) { + raw_string_multiline.value().text += '\n'; + break; + } + + // here we know that we are dealing with finalized multiline raw string literal + // token needs to use multiline_raw_strings to store string that exists in multiple lines + raw_string_multiline.value().text += raw_string_multiline.value().closing_seq; + + // and position where multiline_raw_string ends (needed for reseting line parsing) + i = end_pos+std::ssize(raw_string_multiline.value().closing_seq)-1; + + const auto& text = raw_string_multiline.value().should_interpolate ? raw_string_multiline.value().text.substr(1) : raw_string_multiline.value().text; + multiline_raw_strings.emplace_back(multiline_raw_string{ text, {lineno, i} }); + + tokens.push_back({ + &multiline_raw_strings.back().text[0], + std::ssize(multiline_raw_strings.back().text), + raw_string_multiline.value().start, + lexeme::StringLiteral + }); + raw_string_multiline.reset(); + continue; + } // Otherwise, we will be at the start of a token, a comment, or whitespace // @@ -1142,7 +1275,62 @@ auto lex_line( store(1, lexeme::QuestionMark); break;case '$': - store(1, lexeme::Dollar); + if (peek1 == 'R' && peek2 == '"') { + // if peek(j-2) is 'R' it means that we deal with raw-string literal + auto R_pos = i + 1; + auto seq_pos = i + 3; + + if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) { + auto opening_seq = line.substr(i, paren_pos - i + 1); + auto closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\""; + + if (auto closing_pos = line.find(closing_seq, paren_pos+1); closing_pos != line.npos) { + if (interpolate_raw_string( + opening_seq, + closing_seq, + string_parts::on_both_ends, + std::string_view(&line[paren_pos+1], closing_pos-paren_pos-1), i, closing_pos-i+std::ssize(closing_seq)) + ) { + continue; + } + + tokens.push_back({ + &line[R_pos], + i - R_pos + 1, + source_position(lineno, R_pos + 1), + lexeme::StringLiteral + }); + } else { + raw_string_multiline.emplace(raw_string{source_position{lineno, i}, opening_seq, opening_seq, closing_seq, true }); + + if (interpolate_raw_string( + opening_seq, + closing_seq, + string_parts::on_the_begining, + std::string_view(&line[paren_pos+1], std::ssize(line)-(paren_pos+1)), i, std::ssize(line)-i) + ) { + continue; + } + // skip entire raw string opening sequence R" + i = paren_pos; + + // if we are on the end of the line we need to add new line char + if (i+1 == std::ssize(line)) { + raw_string_multiline.value().text += '\n'; + } + } + continue; + } + else { + errors.emplace_back( + source_position(lineno, i + 1), + "invalid new-line in raw string delimiter \"" + std::string(&line[i],3) + + "\" - stray 'R' in program \"" + ); + } + } else { + store(1, lexeme::Dollar); + } //G //G literal: @@ -1284,54 +1472,80 @@ auto lex_line( //G string-literal: //G encoding-prefix? '"' s-char-seq? '"' + //G encoding-prefix? 'R"' d-char-seq? '(' s-char-seq? ')' d-char-seq? '"' //G //G s-char-seq: //G interpolation? s-char //G interpolation? s-char-seq s-char //G + //G d-char-seq: + //G d-char + //G //G interpolation: //G '(' expression ')' '$' //G else if (auto j = is_encoding_prefix_and('\"')) { - while (auto len = peek_is_sc_char(j, '\"')) { j += len; } - if (peek(j) != '\"') { - errors.emplace_back( - source_position(lineno, i), - "string literal \"" + std::string(&line[i+1],j) - + "\" is missing its closing \"" - ); - } - - // At this point we have a string-literal, but it may contain - // captures/interpolations we want to tokenize - auto literal = std::string_view{ &line[i], std::size_t(j+1) }; - auto s = expand_string_literal( literal, errors, source_position(lineno, i + 1) ); - - // If there are no captures/interpolations, just store it directly and continue - if (std::ssize(s) == j+1) { - store(j+1, lexeme::StringLiteral); + // if peek(j-2) is 'R' it means that we deal with raw-string literal + if (peek(j-2) == 'R') { + auto seq_pos = i + j; + + if (auto paren_pos = line.find("(", seq_pos); paren_pos != std::string::npos) { + auto opening_seq = line.substr(i, paren_pos - i + 1); + auto closing_seq = ")"+line.substr(seq_pos, paren_pos-seq_pos)+"\""; + + if (auto closing_pos = line.find(closing_seq, paren_pos+1); closing_pos != line.npos) { + store(closing_pos+std::ssize(closing_seq)-i, lexeme::StringLiteral); + } else { + raw_string_multiline.emplace(raw_string{source_position{lineno, i}, opening_seq, opening_seq, closing_seq }); + // skip entire raw string opening sequence R" + i = paren_pos; + + // if we are on the end of the line we need to add new line char + if (i+1 == std::ssize(line)) { + raw_string_multiline.value().text += '\n'; + } + } + continue; + } + else { + errors.emplace_back( + source_position(lineno, i + j - 2), + "invalid new-line in raw string delimiter \"" + std::string(&line[i],j) + + "\" - stray 'R' in program \"" + ); + } } - // Otherwise, replace it with the expanded version and continue else { - if (std::ssize(s) <= j + 1) { + while (auto len = peek_is_sc_char(j, '\"')) { j += len; } + if (peek(j) != '\"') { errors.emplace_back( - source_position( lineno, i ), - "not a legal string literal" + source_position(lineno, i), + "string literal \"" + std::string(&line[i+1],j) + + "\" is missing its closing \"" ); - return {}; } - mutable_line.replace( i, j+1, s ); - - // Redo processing of this whole line now that the string is expanded, - // which may have moved it in memory... move i back to the line start - // and discard any tokens we already tokenized for this line - i = colno_t{-1}; - while ( - !tokens.empty() - && tokens.back().position().lineno == lineno - ) - { - tokens.pop_back(); + + // At this point we have a string-literal, but it may contain + // captures/interpolations we want to tokenize + auto literal = std::string_view{ &line[i], std::size_t(j+1) }; + auto s = expand_string_literal( literal, errors, source_position(lineno, i + 1) ); + + // If there are no captures/interpolations, just store it directly and continue + if (std::ssize(s) == j+1) { + store(j+1, lexeme::StringLiteral); + } + // Otherwise, replace it with the expanded version and continue + else { + if (std::ssize(s) <= j + 1) { + errors.emplace_back( + source_position( lineno, i ), + "not a legal string literal" + ); + return {}; + } + mutable_line.replace( i, j+1, s ); + + reset_processing_of_the_line(); } } } @@ -1438,6 +1652,9 @@ auto lex_line( if (in_comment) { current_comment += "\n"; } + if (raw_string_multiline && line.size() == 0) { + raw_string_multiline.value().text += '\n'; + } assert (std::ssize(tokens) >= original_size); return std::ssize(tokens) != original_size; @@ -1494,6 +1711,8 @@ class tokens -> void { auto in_comment = false; + auto raw_string_multiline = std::optional(); + assert (std::ssize(lines) > 0); auto line = std::begin(lines)+1; while (line != std::end(lines)) { @@ -1522,7 +1741,8 @@ class tokens lex_line( line->text, lineno, in_comment, current_comment, current_comment_start, - entry, comments, errors + entry, comments, errors, + raw_string_multiline ); }