Skip to content

Commit

Permalink
Merge pull request #15209 from clonker/asm_parser_use_scanner
Browse files Browse the repository at this point in the history
AsmParser: parse source comment using scanner instead of regex
  • Loading branch information
clonker committed Jul 1, 2024
2 parents abf88cd + 8b0bb61 commit 8d788fb
Show file tree
Hide file tree
Showing 7 changed files with 244 additions and 48 deletions.
3 changes: 0 additions & 3 deletions .circleci/parallel_bytecode_report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ cd test-cases/
echo "Preparing input files"
python3 ../scripts/isolate_tests.py ../test/

# FIXME: These cases crash because of https://github.com/ethereum/solidity/issues/13583
rm ./*_bytecode_too_large_*.sol ./*_combined_too_large_*.sol

if [[ $binary_type == native || $binary_type == "osx_intel" ]]; then
interface=$(echo -e "standard-json\ncli" | circleci tests split)
echo "Selected interface: ${interface}"
Expand Down
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Language Features:
* Accept declarations of state variables with ``transient`` data location (parser support only, no code generation yet).
* Make ``require(bool, Error)`` available when using the legacy pipeline.
* Yul: Parsing rules for source location comments have been relaxed: Whitespace between the location components as well as single-quoted code snippets are now allowed.


Compiler Features:
Expand All @@ -19,6 +20,7 @@ Bugfixes:
* SMTChecker: Fix internal compiler error when reporting proved targets for BMC engine.
* TypeChecker: Fix segfault when assigning nested tuple to tuple.
* Yul Optimizer: Name simplification could lead to forbidden identifiers with a leading and/or trailing dot, e.g., ``x._`` would get simplified into ``x.``.
* Yul Parser: Fix segfault when parsing very long location comments.


### 0.8.26 (2024-05-21)
Expand Down
50 changes: 35 additions & 15 deletions liblangutil/Scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -826,31 +826,48 @@ Token Scanner::scanString(bool const _isUnicode)
char const quote = m_char;
advance(); // consume quote
LiteralScope literal(this, LITERAL_TYPE_STRING);
while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
// for source location comments we allow multiline string literals
while (m_char != quote && !isSourcePastEndOfInput() && (!isUnicodeLinebreak() || m_kind == ScannerKind::SpecialComment))
{
char c = m_char;
advance();
if (c == '\\')

if (m_kind == ScannerKind::SpecialComment)
{
if (isSourcePastEndOfInput() || !scanEscape())
return setError(ScannerError::IllegalEscapeSequence);
if (c == '\\')
{
if (isSourcePastEndOfInput())
return setError(ScannerError::IllegalEscapeSequence);
advance();
}
else
addLiteralChar(c);
}
else
{
// Report error on non-printable characters in string literals, however
// allow anything for unicode string literals, because their validity will
// be verified later (in the syntax checker).
//
// We are using a manual range and not isprint() to avoid
// any potential complications with locale.
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
if (c == '\\')
{
if (m_kind == ScannerKind::Yul)
return setError(ScannerError::IllegalCharacterInString);
return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
if (isSourcePastEndOfInput() || !scanEscape())
return setError(ScannerError::IllegalEscapeSequence);
}
else
{
// Report error on non-printable characters in string literals, however
// allow anything for unicode string literals, because their validity will
// be verified later (in the syntax checker).
//
// We are using a manual range and not isprint() to avoid
// any potential complications with locale.
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
{
if (m_kind == ScannerKind::Yul)
return setError(ScannerError::IllegalCharacterInString);
return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
}
addLiteralChar(c);
}
addLiteralChar(c);
}

}
if (m_char != quote)
return setError(ScannerError::IllegalStringEndQuote);
Expand Down Expand Up @@ -1023,6 +1040,9 @@ std::tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal);
switch (m_kind)
{
case ScannerKind::SpecialComment:
// there are no keywords in special comments
return std::make_tuple(Token::Identifier, 0, 0);
case ScannerKind::Solidity:
// Turn experimental Solidity keywords that are not keywords in legacy Solidity into identifiers.
if (TokenTraits::isExperimentalSolidityOnlyKeyword(std::get<0>(token)))
Expand Down
11 changes: 5 additions & 6 deletions liblangutil/Scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,12 @@
namespace solidity::langutil
{

class AstRawString;
class AstValueFactory;
class ParserRecorder;

enum class ScannerKind
{
Solidity,
Yul,
ExperimentalSolidity
ExperimentalSolidity,
SpecialComment
};

enum class ScannerError
Expand Down Expand Up @@ -102,11 +99,13 @@ class Scanner
{
friend class LiteralScope;
public:
explicit Scanner(CharStream& _source):
explicit Scanner(CharStream& _source, ScannerKind _kind = ScannerKind::Solidity):
m_source(_source),
m_sourceName{std::make_shared<std::string>(_source.name())}
{
reset();
if (_kind != ScannerKind::Solidity)
setScannerMode(_kind);
}

/// Resets scanner to the start of input.
Expand Down
82 changes: 58 additions & 24 deletions libyul/AsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,10 @@
#include <liblangutil/ErrorReporter.h>
#include <liblangutil/Exceptions.h>
#include <liblangutil/Scanner.h>
#include <liblangutil/Common.h>
#include <libsolutil/Common.h>
#include <libsolutil/Visitor.h>

#include <range/v3/view/subrange.hpp>

#include <boost/algorithm/string.hpp>

#include <algorithm>
Expand All @@ -53,7 +52,7 @@ std::optional<int> toInt(std::string const& _value)
{
return stoi(_value);
}
catch (...)
catch (std::out_of_range const&)
{
return std::nullopt;
}
Expand Down Expand Up @@ -193,13 +192,41 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
langutil::SourceLocation const& _commentLocation
)
{
static std::regex const argsRegex = std::regex(
R"~~(^(-1|\d+):(-1|\d+):(-1|\d+)(?:\s+|$))~~" // index and location, e.g.: 1:234:-1
R"~~(("(?:[^"\\]|\\.)*"?)?)~~", // optional code snippet, e.g.: "string memory s = \"abc\";..."
std::regex_constants::ECMAScript | std::regex_constants::optimize
);
std::match_results<std::string_view::const_iterator> match;
if (!regex_search(_arguments.cbegin(), _arguments.cend(), match, argsRegex))
CharStream argumentStream(std::string(_arguments), "");
Scanner scanner(argumentStream, ScannerKind::SpecialComment);

std::string_view tail{_arguments.substr(_arguments.size())};
auto const parseLocationComponent = [](Scanner& _scanner, bool expectTrailingColon) -> std::optional<std::string>
{
bool negative = false;
if (_scanner.currentToken() == Token::Sub)
{
negative = true;
_scanner.next();
}
if (_scanner.currentToken() != Token::Number)
return std::nullopt;
if (expectTrailingColon && _scanner.peekNextToken() != Token::Colon)
return std::nullopt;
if (!isValidDecimal(_scanner.currentLiteral()))
return std::nullopt;
std::string decimal = (negative ? "-" : "") + _scanner.currentLiteral();
_scanner.next();
if (expectTrailingColon)
_scanner.next();
return decimal;
};
std::optional<std::string> rawSourceIndex = parseLocationComponent(scanner, true);
std::optional<std::string> rawStart = parseLocationComponent(scanner, true);
std::optional<std::string> rawEnd = parseLocationComponent(scanner, false);

size_t const snippetStart = static_cast<size_t>(scanner.currentLocation().start);
bool const locationScannedSuccessfully = rawSourceIndex && rawStart && rawEnd;
bool const locationIsWhitespaceSeparated =
scanner.peekNextToken() == Token::EOS ||
(snippetStart > 0 && langutil::isWhiteSpace(_arguments[snippetStart - 1]));

if (!locationScannedSuccessfully || !locationIsWhitespaceSeparated)
{
m_errorReporter.syntaxError(
8387_error,
Expand All @@ -209,13 +236,16 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
return std::nullopt;
}

solAssert(match.size() == 5, "");
std::string_view tail = _arguments.substr(static_cast<size_t>(match.position() + match.length()));
// captures error cases `"test` (illegal end quote) and `"test\` (illegal escape sequence / dangling backslash)
bool const illegalLiteral = scanner.currentToken() == Token::Illegal && (scanner.currentError() == ScannerError::IllegalStringEndQuote || scanner.currentError() == ScannerError::IllegalEscapeSequence);
if (scanner.currentToken() == Token::StringLiteral || illegalLiteral)
tail = _arguments.substr(static_cast<size_t>(scanner.currentLocation().end));
else
tail = _arguments.substr(static_cast<size_t>(scanner.currentLocation().start));

if (match[4].matched && (
!boost::algorithm::ends_with(match[4].str(), "\"") ||
boost::algorithm::ends_with(match[4].str(), "\\\"")
))
// Other scanner errors may occur if there is no string literal which follows
// (f.ex. IllegalHexDigit, IllegalCommentTerminator), but these are ignored
if (illegalLiteral)
{
m_errorReporter.syntaxError(
1544_error,
Expand All @@ -225,30 +255,34 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
return {{tail, SourceLocation{}}};
}

std::optional<int> const sourceIndex = toInt(match[1].str());
std::optional<int> const start = toInt(match[2].str());
std::optional<int> const end = toInt(match[3].str());
std::optional<int> const sourceIndex = toInt(*rawSourceIndex);
std::optional<int> const start = toInt(*rawStart);
std::optional<int> const end = toInt(*rawEnd);

if (!sourceIndex.has_value() || !start.has_value() || !end.has_value())
if (
!sourceIndex.has_value() || *sourceIndex < -1 ||
!start.has_value() || *start < -1 ||
!end.has_value() || *end < -1
)
m_errorReporter.syntaxError(
6367_error,
_commentLocation,
"Invalid value in source location mapping. "
"Expected non-negative integer values or -1 for source index and location."
);
else if (sourceIndex == -1)
return {{tail, SourceLocation{start.value(), end.value(), nullptr}}};
else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast<unsigned>(sourceIndex.value()))))
return {{tail, SourceLocation{*start, *end, nullptr}}};
else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast<unsigned>(*sourceIndex))))
m_errorReporter.syntaxError(
2674_error,
_commentLocation,
"Invalid source mapping. Source index not defined via @use-src."
);
else
{
std::shared_ptr<std::string const> sourceName = m_sourceNames->at(static_cast<unsigned>(sourceIndex.value()));
std::shared_ptr<std::string const> sourceName = m_sourceNames->at(static_cast<unsigned>(*sourceIndex));
solAssert(sourceName, "");
return {{tail, SourceLocation{start.value(), end.value(), std::move(sourceName)}}};
return {{tail, SourceLocation{*start, *end, std::move(sourceName)}}};
}
return {{tail, SourceLocation{}}};
}
Expand Down
46 changes: 46 additions & 0 deletions test/liblangutil/Scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,52 @@ BOOST_AUTO_TEST_CASE(yul_function_with_whitespace)
BOOST_CHECK_EQUAL(scanner.next(), Token::EOS);
}

BOOST_AUTO_TEST_CASE(special_comment_with_invalid_escapes)
{
std::string input(R"("test\x\f\g\u\g\a\u\g\a12\uö\xyoof")");
std::string expectedOutput(R"(test12öyoof)");
CharStream stream(input, "");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
}

BOOST_AUTO_TEST_CASE(special_comment_with_valid_and_invalid_escapes)
{
std::string input(R"("test\n\x61\t\u01A9test\f")");
std::string expectedOutput(R"(test6101A9test)");
CharStream stream(input, "");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
}

BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_escape_sequence_at_eos)
{
CharStream stream(R"("test\)", "");
std::string expectedOutput(R"(test6101A9test)");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalEscapeSequence);
}

BOOST_AUTO_TEST_CASE(special_comment_with_escaped_quotes)
{
CharStream stream(R"("test\\\"")", "");
std::string expectedOutput(R"(test)");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
}

BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
{
CharStream stream(R"("test)", "");
Scanner scanner(stream, ScannerKind::SpecialComment);
BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalStringEndQuote);
}

BOOST_AUTO_TEST_SUITE_END()

} // end namespaces
Loading

0 comments on commit 8d788fb

Please sign in to comment.