diff --git a/Changelog.md b/Changelog.md index 698c7fb5d26c..f4ac02b29359 100644 --- a/Changelog.md +++ b/Changelog.md @@ -13,6 +13,7 @@ Compiler Features: Bugfixes: + * AsmParser: Source location comments no longer rely no regex to be parsed. Whitespaces between the indices as well as single-quoted code snippets are now allowed. * SMTChecker: Fix error that reports invalid number of verified checks for BMC and CHC engines. * SMTChecker: Fix internal compiler error when reporting proved targets for BMC engine. * TypeChecker: Fix segfault when assigning nested tuple to tuple. diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp index 806efa311668..52e587746fda 100644 --- a/liblangutil/Scanner.cpp +++ b/liblangutil/Scanner.cpp @@ -826,31 +826,44 @@ Token Scanner::scanString(bool const _isUnicode) char const quote = m_char; advance(); // consume quote LiteralScope literal(this, LITERAL_TYPE_STRING); - while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak()) + // for source location comments we allow multiline string literals + while (m_char != quote && !isSourcePastEndOfInput() && (!isUnicodeLinebreak() || m_kind == ScannerKind::SourceLocationComment)) { char c = m_char; advance(); - if (c == '\\') + + if (m_kind == ScannerKind::SourceLocationComment) { - if (isSourcePastEndOfInput() || !scanEscape()) - return setError(ScannerError::IllegalEscapeSequence); + if (c == '\\') + scanEscape(); + else + addLiteralChar(c); } else { - // Report error on non-printable characters in string literals, however - // allow anything for unicode string literals, because their validity will - // be verified later (in the syntax checker). - // - // We are using a manual range and not isprint() to avoid - // any potential complications with locale. - if (!_isUnicode && (static_cast(c) <= 0x1f || static_cast(c) >= 0x7f)) + if (c == '\\') { - if (m_kind == ScannerKind::Yul) - return setError(ScannerError::IllegalCharacterInString); - return setError(ScannerError::UnicodeCharacterInNonUnicodeString); + if (isSourcePastEndOfInput() || !scanEscape()) + return setError(ScannerError::IllegalEscapeSequence); + } + else + { + // Report error on non-printable characters in string literals, however + // allow anything for unicode string literals, because their validity will + // be verified later (in the syntax checker). + // + // We are using a manual range and not isprint() to avoid + // any potential complications with locale. + if (!_isUnicode && (static_cast(c) <= 0x1f || static_cast(c) >= 0x7f)) + { + if (m_kind == ScannerKind::Yul) + return setError(ScannerError::IllegalCharacterInString); + return setError(ScannerError::UnicodeCharacterInNonUnicodeString); + } + addLiteralChar(c); } - addLiteralChar(c); } + } if (m_char != quote) return setError(ScannerError::IllegalStringEndQuote); @@ -1023,6 +1036,8 @@ std::tuple Scanner::scanIdentifierOrKeyword() auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal); switch (m_kind) { + case ScannerKind::SourceLocationComment: + [[fallthrough]]; case ScannerKind::Solidity: // Turn experimental Solidity keywords that are not keywords in legacy Solidity into identifiers. if (TokenTraits::isExperimentalSolidityOnlyKeyword(std::get<0>(token))) diff --git a/liblangutil/Scanner.h b/liblangutil/Scanner.h index eaa2b3b54d1e..3bb95cbdad4f 100644 --- a/liblangutil/Scanner.h +++ b/liblangutil/Scanner.h @@ -62,15 +62,12 @@ namespace solidity::langutil { -class AstRawString; -class AstValueFactory; -class ParserRecorder; - enum class ScannerKind { Solidity, Yul, - ExperimentalSolidity + ExperimentalSolidity, + SourceLocationComment }; enum class ScannerError diff --git a/libyul/AsmParser.cpp b/libyul/AsmParser.cpp index a009b005d6bd..e00201162747 100644 --- a/libyul/AsmParser.cpp +++ b/libyul/AsmParser.cpp @@ -27,11 +27,10 @@ #include #include #include +#include #include #include -#include - #include #include @@ -52,7 +51,7 @@ std::optional toInt(std::string const& _value) { return stoi(_value); } - catch (...) + catch (std::out_of_range const&) { return std::nullopt; } @@ -192,13 +191,40 @@ std::optional> Parser::parseSrcComme langutil::SourceLocation const& _commentLocation ) { - static std::regex const argsRegex = std::regex( - R"~~(^(-1|\d+):(-1|\d+):(-1|\d+)(?:\s+|$))~~" // index and location, e.g.: 1:234:-1 - R"~~(("(?:[^"\\]|\\.)*"?)?)~~", // optional code snippet, e.g.: "string memory s = \"abc\";..." - std::regex_constants::ECMAScript | std::regex_constants::optimize - ); - std::match_results match; - if (!regex_search(_arguments.cbegin(), _arguments.cend(), match, argsRegex)) + CharStream argumentStream (std::string(_arguments), ""); + Scanner scanner (argumentStream); + scanner.setScannerMode(ScannerKind::SourceLocationComment); + + std::string_view tail { _arguments.substr(_arguments.size()) }; + auto const parseIndex = [](Scanner& _scanner, bool expectTrailingColon) -> std::optional + { + bool negative = false; + if (_scanner.currentToken() == Token::Sub) + { + negative = true; + _scanner.next(); + } + if (_scanner.currentToken() != Token::Number) + return std::nullopt; + if (expectTrailingColon && _scanner.peekNextToken() != Token::Colon) + return std::nullopt; + if (!isValidDecimal(_scanner.currentLiteral())) + return std::nullopt; + std::string decimal = (negative ? "-" : "") + _scanner.currentLiteral(); + _scanner.next(); + if (expectTrailingColon) + _scanner.next(); + return decimal; + }; + std::optional rawSourceIndex = parseIndex(scanner, true); + std::optional rawStart = parseIndex(scanner, true); + std::optional rawEnd = parseIndex(scanner, false); + + bool const successfullyScannedIndices = rawSourceIndex && rawStart && rawEnd; + size_t const startIndex = static_cast(scanner.currentLocation().start); + bool success = successfullyScannedIndices && (scanner.peekNextToken() == Token::EOS || (startIndex > 0 && langutil::isWhiteSpace(_arguments[startIndex - 1]))); + + if (!success) { m_errorReporter.syntaxError( 8387_error, @@ -208,13 +234,22 @@ std::optional> Parser::parseSrcComme return std::nullopt; } - solAssert(match.size() == 5, ""); - std::string_view tail = _arguments.substr(static_cast(match.position() + match.length())); + if (scanner.currentToken() == Token::StringLiteral || (scanner.currentToken() == Token::Illegal && scanner.currentError() == ScannerError::IllegalStringEndQuote)) + tail = _arguments.substr(static_cast(scanner.currentLocation().end)); + else + tail = _arguments.substr(static_cast(scanner.currentLocation().start)); + + if (scanner.currentToken() == Token::HexStringLiteral || scanner.currentToken() == Token::UnicodeStringLiteral) + { + m_errorReporter.syntaxError( + 1543_error, + _commentLocation, + "Invalid code snippet in source location mapping. Hex and unicode strings not allowed." + ); + return {{tail, SourceLocation{}}}; + } - if (match[4].matched && ( - !boost::algorithm::ends_with(match[4].str(), "\"") || - boost::algorithm::ends_with(match[4].str(), "\\\"") - )) + if (scanner.currentToken() == Token::Illegal && scanner.currentError() == ScannerError::IllegalStringEndQuote) { m_errorReporter.syntaxError( 1544_error, @@ -224,11 +259,27 @@ std::optional> Parser::parseSrcComme return {{tail, SourceLocation{}}}; } - std::optional const sourceIndex = toInt(match[1].str()); - std::optional const start = toInt(match[2].str()); - std::optional const end = toInt(match[3].str()); + int sourceIndex = -1; + int start = -1; + int end = -1; + bool const successfullyInterpretedIndices = [&]() + { + if (successfullyScannedIndices) + { + auto const indexPairs = { std::tie(rawSourceIndex, sourceIndex), std::tie(rawStart, start), std::tie(rawEnd, end) }; + for (auto& [rawIndex, interpretedIndex] : indexPairs) + { + auto const interpreted = toInt(*rawIndex); + if (interpreted) + interpretedIndex = *interpreted; + else + return false; + } + } + return successfullyScannedIndices; + }(); - if (!sourceIndex.has_value() || !start.has_value() || !end.has_value()) + if (!successfullyInterpretedIndices || sourceIndex < -1 || start < -1 || end < -1) m_errorReporter.syntaxError( 6367_error, _commentLocation, @@ -236,8 +287,8 @@ std::optional> Parser::parseSrcComme "Expected non-negative integer values or -1 for source index and location." ); else if (sourceIndex == -1) - return {{tail, SourceLocation{start.value(), end.value(), nullptr}}}; - else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast(sourceIndex.value())))) + return {{tail, SourceLocation{start, end, nullptr}}}; + else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast(sourceIndex)))) m_errorReporter.syntaxError( 2674_error, _commentLocation, @@ -245,9 +296,8 @@ std::optional> Parser::parseSrcComme ); else { - std::shared_ptr sourceName = m_sourceNames->at(static_cast(sourceIndex.value())); - solAssert(sourceName, ""); - return {{tail, SourceLocation{start.value(), end.value(), std::move(sourceName)}}}; + std::shared_ptr sourceName = m_sourceNames->at(static_cast(sourceIndex)); + return {{tail, SourceLocation{start, end, std::move(sourceName)}}}; } return {{tail, SourceLocation{}}}; } diff --git a/scripts/error_codes.py b/scripts/error_codes.py index 6171714bd7ca..14c2f0231f58 100755 --- a/scripts/error_codes.py +++ b/scripts/error_codes.py @@ -192,6 +192,7 @@ def examine_id_coverage(top_dir, source_id_to_file_names, new_ids_only=False): # white list of ids which are not covered by tests white_ids = { "9804", # Tested in test/libyul/ObjectParser.cpp. + "1543", "1544", "1749", "2674", diff --git a/test/libyul/Parser.cpp b/test/libyul/Parser.cpp index 46c4f11a8c00..cc9a96dca4a8 100644 --- a/test/libyul/Parser.cpp +++ b/test/libyul/Parser.cpp @@ -767,6 +767,64 @@ BOOST_AUTO_TEST_CASE(customSourceLocations_two_locations_with_snippets_untermina CHECK_LOCATION(result->debugData->originLocation, "", -1, -1); } +BOOST_AUTO_TEST_CASE(customSourceLocations_single_quote) +{ + ErrorList errorList; + ErrorReporter reporter(errorList); + auto const sourceText = R"( + /// @src 0:111:222 " + /// + {} + )"; + EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{}); + std::shared_ptr result = parse(sourceText, dialect, reporter); + BOOST_REQUIRE(!!result); + BOOST_REQUIRE(errorList.size() == 1); + BOOST_TEST(errorList[0]->type() == Error::Type::SyntaxError); + BOOST_TEST(errorList[0]->errorId() == 1544_error); + CHECK_LOCATION(result->debugData->originLocation, "", -1, -1); +} + +BOOST_AUTO_TEST_CASE(customSourceLocations_hex_snippet_disallowed) +{ + ErrorList errorList; + ErrorReporter reporter(errorList); + auto const sourceText = R"( + /// @src 3:111:222 hex"ab12" + /// @src 2:111:222 unicode"ab12" + /// @src 1:123:321 'allowed' + /// @src 0:321:123 "allowed" + {} + )"; + EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{}); + std::shared_ptr result = parse(sourceText, dialect, reporter); + BOOST_REQUIRE(!!result); + BOOST_REQUIRE(errorList.size() == 2); + BOOST_TEST(errorList[0]->type() == Error::Type::SyntaxError); + BOOST_TEST(errorList[0]->errorId() == 1543_error); + BOOST_TEST(errorList[1]->type() == Error::Type::SyntaxError); + BOOST_TEST(errorList[1]->errorId() == 1543_error); + CHECK_LOCATION(result->debugData->originLocation, "source0", 321, 123); +} + +BOOST_AUTO_TEST_CASE(customSourceLocations_multi_line_source_loc) +{ + ErrorList errorList; + ErrorReporter reporter(errorList); + auto const sourceText = R"( + /// @src 1 : 111: + /// 222 " + /// abc\"def + /// + /// " @src 0:333:444 + {} + )"; + EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{}); + std::shared_ptr result = parse(sourceText, dialect, reporter); + BOOST_REQUIRE(!!result && errorList.empty()); + CHECK_LOCATION(result->debugData->originLocation, "source0", 333, 444); +} + BOOST_AUTO_TEST_CASE(customSourceLocations_with_code_snippets_with_nested_locations) { ErrorList errorList;