Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AsmParser: parse source comment using scanner instead of regex #15209

Merged
merged 3 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .circleci/parallel_bytecode_report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ cd test-cases/
echo "Preparing input files"
python3 ../scripts/isolate_tests.py ../test/

# FIXME: These cases crash because of https://github.com/ethereum/solidity/issues/13583
rm ./*_bytecode_too_large_*.sol ./*_combined_too_large_*.sol

if [[ $binary_type == native || $binary_type == "osx_intel" ]]; then
interface=$(echo -e "standard-json\ncli" | circleci tests split)
echo "Selected interface: ${interface}"
Expand Down
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Language Features:
* Accept declarations of state variables with ``transient`` data location (parser support only, no code generation yet).
* Make ``require(bool, Error)`` available when using the legacy pipeline.
* Yul: Parsing rules for source location comments have been relaxed: Whitespace between the location components as well as single-quoted code snippets are now allowed.


Compiler Features:
Expand All @@ -18,6 +19,7 @@ Bugfixes:
* SMTChecker: Fix internal compiler error when reporting proved targets for BMC engine.
* TypeChecker: Fix segfault when assigning nested tuple to tuple.
* Yul Optimizer: Name simplification could lead to forbidden identifiers with a leading and/or trailing dot, e.g., ``x._`` would get simplified into ``x.``.
* Yul Parser: Fix segfault when parsing very long location comments.


### 0.8.26 (2024-05-21)
Expand Down
50 changes: 35 additions & 15 deletions liblangutil/Scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -826,31 +826,48 @@ Token Scanner::scanString(bool const _isUnicode)
char const quote = m_char;
advance(); // consume quote
LiteralScope literal(this, LITERAL_TYPE_STRING);
while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
// for source location comments we allow multiline string literals
while (m_char != quote && !isSourcePastEndOfInput() && (!isUnicodeLinebreak() || m_kind == ScannerKind::SpecialComment))
{
char c = m_char;
advance();
if (c == '\\')

if (m_kind == ScannerKind::SpecialComment)
{
if (isSourcePastEndOfInput() || !scanEscape())
return setError(ScannerError::IllegalEscapeSequence);
if (c == '\\')
{
if (isSourcePastEndOfInput())
return setError(ScannerError::IllegalEscapeSequence);
advance();
}
else
addLiteralChar(c);
}
else
{
// Report error on non-printable characters in string literals, however
// allow anything for unicode string literals, because their validity will
// be verified later (in the syntax checker).
//
// We are using a manual range and not isprint() to avoid
// any potential complications with locale.
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
if (c == '\\')
{
if (m_kind == ScannerKind::Yul)
return setError(ScannerError::IllegalCharacterInString);
return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
if (isSourcePastEndOfInput() || !scanEscape())
return setError(ScannerError::IllegalEscapeSequence);
}
else
{
// Report error on non-printable characters in string literals, however
// allow anything for unicode string literals, because their validity will
// be verified later (in the syntax checker).
//
// We are using a manual range and not isprint() to avoid
// any potential complications with locale.
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
{
if (m_kind == ScannerKind::Yul)
return setError(ScannerError::IllegalCharacterInString);
return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
}
addLiteralChar(c);
}
addLiteralChar(c);
}

}
if (m_char != quote)
return setError(ScannerError::IllegalStringEndQuote);
Expand Down Expand Up @@ -1023,6 +1040,9 @@ std::tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal);
switch (m_kind)
{
case ScannerKind::SpecialComment:
// there are no keywords in special comments
return std::make_tuple(Token::Identifier, 0, 0);
case ScannerKind::Solidity:
// Turn experimental Solidity keywords that are not keywords in legacy Solidity into identifiers.
if (TokenTraits::isExperimentalSolidityOnlyKeyword(std::get<0>(token)))
Expand Down
11 changes: 5 additions & 6 deletions liblangutil/Scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,12 @@
namespace solidity::langutil
{

class AstRawString;
class AstValueFactory;
class ParserRecorder;

enum class ScannerKind
{
Solidity,
Yul,
ExperimentalSolidity
ExperimentalSolidity,
SpecialComment
};

enum class ScannerError
Expand Down Expand Up @@ -102,11 +99,13 @@ class Scanner
{
friend class LiteralScope;
public:
explicit Scanner(CharStream& _source):
explicit Scanner(CharStream& _source, ScannerKind _kind = ScannerKind::Solidity):
m_source(_source),
m_sourceName{std::make_shared<std::string>(_source.name())}
{
reset();
if (_kind != ScannerKind::Solidity)
setScannerMode(_kind);
}

/// Resets scanner to the start of input.
Expand Down
82 changes: 58 additions & 24 deletions libyul/AsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,10 @@
#include <liblangutil/ErrorReporter.h>
#include <liblangutil/Exceptions.h>
#include <liblangutil/Scanner.h>
#include <liblangutil/Common.h>
#include <libsolutil/Common.h>
#include <libsolutil/Visitor.h>

#include <range/v3/view/subrange.hpp>

#include <boost/algorithm/string.hpp>

#include <algorithm>
Expand All @@ -53,7 +52,7 @@ std::optional<int> toInt(std::string const& _value)
{
return stoi(_value);
}
catch (...)
catch (std::out_of_range const&)
{
return std::nullopt;
}
Expand Down Expand Up @@ -193,13 +192,41 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
langutil::SourceLocation const& _commentLocation
)
{
static std::regex const argsRegex = std::regex(
R"~~(^(-1|\d+):(-1|\d+):(-1|\d+)(?:\s+|$))~~" // index and location, e.g.: 1:234:-1
R"~~(("(?:[^"\\]|\\.)*"?)?)~~", // optional code snippet, e.g.: "string memory s = \"abc\";..."
std::regex_constants::ECMAScript | std::regex_constants::optimize
);
std::match_results<std::string_view::const_iterator> match;
if (!regex_search(_arguments.cbegin(), _arguments.cend(), match, argsRegex))
CharStream argumentStream(std::string(_arguments), "");
Scanner scanner(argumentStream, ScannerKind::SpecialComment);

std::string_view tail{_arguments.substr(_arguments.size())};
auto const parseLocationComponent = [](Scanner& _scanner, bool expectTrailingColon) -> std::optional<std::string>
{
bool negative = false;
if (_scanner.currentToken() == Token::Sub)
{
negative = true;
_scanner.next();
}
if (_scanner.currentToken() != Token::Number)
return std::nullopt;
if (expectTrailingColon && _scanner.peekNextToken() != Token::Colon)
return std::nullopt;
if (!isValidDecimal(_scanner.currentLiteral()))
return std::nullopt;
std::string decimal = (negative ? "-" : "") + _scanner.currentLiteral();
_scanner.next();
if (expectTrailingColon)
_scanner.next();
return decimal;
};
std::optional<std::string> rawSourceIndex = parseLocationComponent(scanner, true);
std::optional<std::string> rawStart = parseLocationComponent(scanner, true);
std::optional<std::string> rawEnd = parseLocationComponent(scanner, false);

size_t const snippetStart = static_cast<size_t>(scanner.currentLocation().start);
bool const locationScannedSuccessfully = rawSourceIndex && rawStart && rawEnd;
bool const locationIsWhitespaceSeparated =
scanner.peekNextToken() == Token::EOS ||
(snippetStart > 0 && langutil::isWhiteSpace(_arguments[snippetStart - 1]));

if (!locationScannedSuccessfully || !locationIsWhitespaceSeparated)
{
m_errorReporter.syntaxError(
8387_error,
Expand All @@ -209,13 +236,16 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
return std::nullopt;
}

solAssert(match.size() == 5, "");
std::string_view tail = _arguments.substr(static_cast<size_t>(match.position() + match.length()));
// captures error cases `"test` (illegal end quote) and `"test\` (illegal escape sequence / dangling backslash)
bool const illegalLiteral = scanner.currentToken() == Token::Illegal && (scanner.currentError() == ScannerError::IllegalStringEndQuote || scanner.currentError() == ScannerError::IllegalEscapeSequence);
if (scanner.currentToken() == Token::StringLiteral || illegalLiteral)
tail = _arguments.substr(static_cast<size_t>(scanner.currentLocation().end));
else
tail = _arguments.substr(static_cast<size_t>(scanner.currentLocation().start));

if (match[4].matched && (
!boost::algorithm::ends_with(match[4].str(), "\"") ||
boost::algorithm::ends_with(match[4].str(), "\\\"")
))
// Other scanner errors may occur if there is no string literal which follows
// (f.ex. IllegalHexDigit, IllegalCommentTerminator), but these are ignored
if (illegalLiteral)
{
m_errorReporter.syntaxError(
1544_error,
Expand All @@ -225,30 +255,34 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
return {{tail, SourceLocation{}}};
}

std::optional<int> const sourceIndex = toInt(match[1].str());
std::optional<int> const start = toInt(match[2].str());
std::optional<int> const end = toInt(match[3].str());
std::optional<int> const sourceIndex = toInt(*rawSourceIndex);
std::optional<int> const start = toInt(*rawStart);
std::optional<int> const end = toInt(*rawEnd);

if (!sourceIndex.has_value() || !start.has_value() || !end.has_value())
if (
!sourceIndex.has_value() || *sourceIndex < -1 ||
!start.has_value() || *start < -1 ||
!end.has_value() || *end < -1
)
m_errorReporter.syntaxError(
6367_error,
_commentLocation,
"Invalid value in source location mapping. "
"Expected non-negative integer values or -1 for source index and location."
);
else if (sourceIndex == -1)
return {{tail, SourceLocation{start.value(), end.value(), nullptr}}};
else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast<unsigned>(sourceIndex.value()))))
return {{tail, SourceLocation{*start, *end, nullptr}}};
else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast<unsigned>(*sourceIndex))))
m_errorReporter.syntaxError(
2674_error,
_commentLocation,
"Invalid source mapping. Source index not defined via @use-src."
);
else
{
std::shared_ptr<std::string const> sourceName = m_sourceNames->at(static_cast<unsigned>(sourceIndex.value()));
std::shared_ptr<std::string const> sourceName = m_sourceNames->at(static_cast<unsigned>(*sourceIndex));
solAssert(sourceName, "");
clonker marked this conversation as resolved.
Show resolved Hide resolved
return {{tail, SourceLocation{start.value(), end.value(), std::move(sourceName)}}};
return {{tail, SourceLocation{*start, *end, std::move(sourceName)}}};
}
return {{tail, SourceLocation{}}};
}
Expand Down
46 changes: 46 additions & 0 deletions test/liblangutil/Scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,52 @@ BOOST_AUTO_TEST_CASE(yul_function_with_whitespace)
BOOST_CHECK_EQUAL(scanner.next(), Token::EOS);
}

BOOST_AUTO_TEST_CASE(special_comment_with_invalid_escapes)
{
std::string input(R"("test\x\f\g\u\g\a\u\g\a12\uö\xyoof")");
std::string expectedOutput(R"(test12öyoof)");
CharStream stream(input, "");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
}

BOOST_AUTO_TEST_CASE(special_comment_with_valid_and_invalid_escapes)
{
std::string input(R"("test\n\x61\t\u01A9test\f")");
std::string expectedOutput(R"(test6101A9test)");
CharStream stream(input, "");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
}

BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_escape_sequence_at_eos)
{
CharStream stream(R"("test\)", "");
std::string expectedOutput(R"(test6101A9test)");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalEscapeSequence);
}

BOOST_AUTO_TEST_CASE(special_comment_with_escaped_quotes)
{
CharStream stream(R"("test\\\"")", "");
std::string expectedOutput(R"(test)");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
}

BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
{
CharStream stream(R"("test)", "");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalStringEndQuote);
}

BOOST_AUTO_TEST_SUITE_END()

} // end namespaces
Loading