Skip to content

Commit 8d788fb

Browse files
authored
Merge pull request #15209 from clonker/asm_parser_use_scanner
AsmParser: parse source comment using scanner instead of regex
2 parents abf88cd + 8b0bb61 commit 8d788fb

File tree

7 files changed

+244
-48
lines changed

7 files changed

+244
-48
lines changed

.circleci/parallel_bytecode_report.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,6 @@ cd test-cases/
4444
echo "Preparing input files"
4545
python3 ../scripts/isolate_tests.py ../test/
4646

47-
# FIXME: These cases crash because of https://github.com/ethereum/solidity/issues/13583
48-
rm ./*_bytecode_too_large_*.sol ./*_combined_too_large_*.sol
49-
5047
if [[ $binary_type == native || $binary_type == "osx_intel" ]]; then
5148
interface=$(echo -e "standard-json\ncli" | circleci tests split)
5249
echo "Selected interface: ${interface}"

Changelog.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Language Features:
44
* Accept declarations of state variables with ``transient`` data location (parser support only, no code generation yet).
55
* Make ``require(bool, Error)`` available when using the legacy pipeline.
6+
* Yul: Parsing rules for source location comments have been relaxed: Whitespace between the location components as well as single-quoted code snippets are now allowed.
67

78

89
Compiler Features:
@@ -19,6 +20,7 @@ Bugfixes:
1920
* SMTChecker: Fix internal compiler error when reporting proved targets for BMC engine.
2021
* TypeChecker: Fix segfault when assigning nested tuple to tuple.
2122
* Yul Optimizer: Name simplification could lead to forbidden identifiers with a leading and/or trailing dot, e.g., ``x._`` would get simplified into ``x.``.
23+
* Yul Parser: Fix segfault when parsing very long location comments.
2224

2325

2426
### 0.8.26 (2024-05-21)

liblangutil/Scanner.cpp

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -826,31 +826,48 @@ Token Scanner::scanString(bool const _isUnicode)
826826
char const quote = m_char;
827827
advance(); // consume quote
828828
LiteralScope literal(this, LITERAL_TYPE_STRING);
829-
while (m_char != quote && !isSourcePastEndOfInput() && !isUnicodeLinebreak())
829+
// for source location comments we allow multiline string literals
830+
while (m_char != quote && !isSourcePastEndOfInput() && (!isUnicodeLinebreak() || m_kind == ScannerKind::SpecialComment))
830831
{
831832
char c = m_char;
832833
advance();
833-
if (c == '\\')
834+
835+
if (m_kind == ScannerKind::SpecialComment)
834836
{
835-
if (isSourcePastEndOfInput() || !scanEscape())
836-
return setError(ScannerError::IllegalEscapeSequence);
837+
if (c == '\\')
838+
{
839+
if (isSourcePastEndOfInput())
840+
return setError(ScannerError::IllegalEscapeSequence);
841+
advance();
842+
}
843+
else
844+
addLiteralChar(c);
837845
}
838846
else
839847
{
840-
// Report error on non-printable characters in string literals, however
841-
// allow anything for unicode string literals, because their validity will
842-
// be verified later (in the syntax checker).
843-
//
844-
// We are using a manual range and not isprint() to avoid
845-
// any potential complications with locale.
846-
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
848+
if (c == '\\')
847849
{
848-
if (m_kind == ScannerKind::Yul)
849-
return setError(ScannerError::IllegalCharacterInString);
850-
return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
850+
if (isSourcePastEndOfInput() || !scanEscape())
851+
return setError(ScannerError::IllegalEscapeSequence);
852+
}
853+
else
854+
{
855+
// Report error on non-printable characters in string literals, however
856+
// allow anything for unicode string literals, because their validity will
857+
// be verified later (in the syntax checker).
858+
//
859+
// We are using a manual range and not isprint() to avoid
860+
// any potential complications with locale.
861+
if (!_isUnicode && (static_cast<unsigned>(c) <= 0x1f || static_cast<unsigned>(c) >= 0x7f))
862+
{
863+
if (m_kind == ScannerKind::Yul)
864+
return setError(ScannerError::IllegalCharacterInString);
865+
return setError(ScannerError::UnicodeCharacterInNonUnicodeString);
866+
}
867+
addLiteralChar(c);
851868
}
852-
addLiteralChar(c);
853869
}
870+
854871
}
855872
if (m_char != quote)
856873
return setError(ScannerError::IllegalStringEndQuote);
@@ -1023,6 +1040,9 @@ std::tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
10231040
auto const token = TokenTraits::fromIdentifierOrKeyword(m_tokens[NextNext].literal);
10241041
switch (m_kind)
10251042
{
1043+
case ScannerKind::SpecialComment:
1044+
// there are no keywords in special comments
1045+
return std::make_tuple(Token::Identifier, 0, 0);
10261046
case ScannerKind::Solidity:
10271047
// Turn experimental Solidity keywords that are not keywords in legacy Solidity into identifiers.
10281048
if (TokenTraits::isExperimentalSolidityOnlyKeyword(std::get<0>(token)))

liblangutil/Scanner.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,12 @@
6262
namespace solidity::langutil
6363
{
6464

65-
class AstRawString;
66-
class AstValueFactory;
67-
class ParserRecorder;
68-
6965
enum class ScannerKind
7066
{
7167
Solidity,
7268
Yul,
73-
ExperimentalSolidity
69+
ExperimentalSolidity,
70+
SpecialComment
7471
};
7572

7673
enum class ScannerError
@@ -102,11 +99,13 @@ class Scanner
10299
{
103100
friend class LiteralScope;
104101
public:
105-
explicit Scanner(CharStream& _source):
102+
explicit Scanner(CharStream& _source, ScannerKind _kind = ScannerKind::Solidity):
106103
m_source(_source),
107104
m_sourceName{std::make_shared<std::string>(_source.name())}
108105
{
109106
reset();
107+
if (_kind != ScannerKind::Solidity)
108+
setScannerMode(_kind);
110109
}
111110

112111
/// Resets scanner to the start of input.

libyul/AsmParser.cpp

Lines changed: 58 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,10 @@
2828
#include <liblangutil/ErrorReporter.h>
2929
#include <liblangutil/Exceptions.h>
3030
#include <liblangutil/Scanner.h>
31+
#include <liblangutil/Common.h>
3132
#include <libsolutil/Common.h>
3233
#include <libsolutil/Visitor.h>
3334

34-
#include <range/v3/view/subrange.hpp>
35-
3635
#include <boost/algorithm/string.hpp>
3736

3837
#include <algorithm>
@@ -53,7 +52,7 @@ std::optional<int> toInt(std::string const& _value)
5352
{
5453
return stoi(_value);
5554
}
56-
catch (...)
55+
catch (std::out_of_range const&)
5756
{
5857
return std::nullopt;
5958
}
@@ -193,13 +192,41 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
193192
langutil::SourceLocation const& _commentLocation
194193
)
195194
{
196-
static std::regex const argsRegex = std::regex(
197-
R"~~(^(-1|\d+):(-1|\d+):(-1|\d+)(?:\s+|$))~~" // index and location, e.g.: 1:234:-1
198-
R"~~(("(?:[^"\\]|\\.)*"?)?)~~", // optional code snippet, e.g.: "string memory s = \"abc\";..."
199-
std::regex_constants::ECMAScript | std::regex_constants::optimize
200-
);
201-
std::match_results<std::string_view::const_iterator> match;
202-
if (!regex_search(_arguments.cbegin(), _arguments.cend(), match, argsRegex))
195+
CharStream argumentStream(std::string(_arguments), "");
196+
Scanner scanner(argumentStream, ScannerKind::SpecialComment);
197+
198+
std::string_view tail{_arguments.substr(_arguments.size())};
199+
auto const parseLocationComponent = [](Scanner& _scanner, bool expectTrailingColon) -> std::optional<std::string>
200+
{
201+
bool negative = false;
202+
if (_scanner.currentToken() == Token::Sub)
203+
{
204+
negative = true;
205+
_scanner.next();
206+
}
207+
if (_scanner.currentToken() != Token::Number)
208+
return std::nullopt;
209+
if (expectTrailingColon && _scanner.peekNextToken() != Token::Colon)
210+
return std::nullopt;
211+
if (!isValidDecimal(_scanner.currentLiteral()))
212+
return std::nullopt;
213+
std::string decimal = (negative ? "-" : "") + _scanner.currentLiteral();
214+
_scanner.next();
215+
if (expectTrailingColon)
216+
_scanner.next();
217+
return decimal;
218+
};
219+
std::optional<std::string> rawSourceIndex = parseLocationComponent(scanner, true);
220+
std::optional<std::string> rawStart = parseLocationComponent(scanner, true);
221+
std::optional<std::string> rawEnd = parseLocationComponent(scanner, false);
222+
223+
size_t const snippetStart = static_cast<size_t>(scanner.currentLocation().start);
224+
bool const locationScannedSuccessfully = rawSourceIndex && rawStart && rawEnd;
225+
bool const locationIsWhitespaceSeparated =
226+
scanner.peekNextToken() == Token::EOS ||
227+
(snippetStart > 0 && langutil::isWhiteSpace(_arguments[snippetStart - 1]));
228+
229+
if (!locationScannedSuccessfully || !locationIsWhitespaceSeparated)
203230
{
204231
m_errorReporter.syntaxError(
205232
8387_error,
@@ -209,13 +236,16 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
209236
return std::nullopt;
210237
}
211238

212-
solAssert(match.size() == 5, "");
213-
std::string_view tail = _arguments.substr(static_cast<size_t>(match.position() + match.length()));
239+
// captures error cases `"test` (illegal end quote) and `"test\` (illegal escape sequence / dangling backslash)
240+
bool const illegalLiteral = scanner.currentToken() == Token::Illegal && (scanner.currentError() == ScannerError::IllegalStringEndQuote || scanner.currentError() == ScannerError::IllegalEscapeSequence);
241+
if (scanner.currentToken() == Token::StringLiteral || illegalLiteral)
242+
tail = _arguments.substr(static_cast<size_t>(scanner.currentLocation().end));
243+
else
244+
tail = _arguments.substr(static_cast<size_t>(scanner.currentLocation().start));
214245

215-
if (match[4].matched && (
216-
!boost::algorithm::ends_with(match[4].str(), "\"") ||
217-
boost::algorithm::ends_with(match[4].str(), "\\\"")
218-
))
246+
// Other scanner errors may occur if there is no string literal which follows
247+
// (f.ex. IllegalHexDigit, IllegalCommentTerminator), but these are ignored
248+
if (illegalLiteral)
219249
{
220250
m_errorReporter.syntaxError(
221251
1544_error,
@@ -225,30 +255,34 @@ std::optional<std::pair<std::string_view, SourceLocation>> Parser::parseSrcComme
225255
return {{tail, SourceLocation{}}};
226256
}
227257

228-
std::optional<int> const sourceIndex = toInt(match[1].str());
229-
std::optional<int> const start = toInt(match[2].str());
230-
std::optional<int> const end = toInt(match[3].str());
258+
std::optional<int> const sourceIndex = toInt(*rawSourceIndex);
259+
std::optional<int> const start = toInt(*rawStart);
260+
std::optional<int> const end = toInt(*rawEnd);
231261

232-
if (!sourceIndex.has_value() || !start.has_value() || !end.has_value())
262+
if (
263+
!sourceIndex.has_value() || *sourceIndex < -1 ||
264+
!start.has_value() || *start < -1 ||
265+
!end.has_value() || *end < -1
266+
)
233267
m_errorReporter.syntaxError(
234268
6367_error,
235269
_commentLocation,
236270
"Invalid value in source location mapping. "
237271
"Expected non-negative integer values or -1 for source index and location."
238272
);
239273
else if (sourceIndex == -1)
240-
return {{tail, SourceLocation{start.value(), end.value(), nullptr}}};
241-
else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast<unsigned>(sourceIndex.value()))))
274+
return {{tail, SourceLocation{*start, *end, nullptr}}};
275+
else if (!(sourceIndex >= 0 && m_sourceNames->count(static_cast<unsigned>(*sourceIndex))))
242276
m_errorReporter.syntaxError(
243277
2674_error,
244278
_commentLocation,
245279
"Invalid source mapping. Source index not defined via @use-src."
246280
);
247281
else
248282
{
249-
std::shared_ptr<std::string const> sourceName = m_sourceNames->at(static_cast<unsigned>(sourceIndex.value()));
283+
std::shared_ptr<std::string const> sourceName = m_sourceNames->at(static_cast<unsigned>(*sourceIndex));
250284
solAssert(sourceName, "");
251-
return {{tail, SourceLocation{start.value(), end.value(), std::move(sourceName)}}};
285+
return {{tail, SourceLocation{*start, *end, std::move(sourceName)}}};
252286
}
253287
return {{tail, SourceLocation{}}};
254288
}

test/liblangutil/Scanner.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,6 +1026,52 @@ BOOST_AUTO_TEST_CASE(yul_function_with_whitespace)
10261026
BOOST_CHECK_EQUAL(scanner.next(), Token::EOS);
10271027
}
10281028

1029+
BOOST_AUTO_TEST_CASE(special_comment_with_invalid_escapes)
1030+
{
1031+
std::string input(R"("test\x\f\g\u\g\a\u\g\a12\uö\xyoof")");
1032+
std::string expectedOutput(R"(test12öyoof)");
1033+
CharStream stream(input, "");
1034+
Scanner scanner(stream, ScannerKind::SpecialComment);
1035+
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
1036+
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
1037+
}
1038+
1039+
BOOST_AUTO_TEST_CASE(special_comment_with_valid_and_invalid_escapes)
1040+
{
1041+
std::string input(R"("test\n\x61\t\u01A9test\f")");
1042+
std::string expectedOutput(R"(test6101A9test)");
1043+
CharStream stream(input, "");
1044+
Scanner scanner(stream, ScannerKind::SpecialComment);
1045+
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
1046+
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
1047+
}
1048+
1049+
BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_escape_sequence_at_eos)
1050+
{
1051+
CharStream stream(R"("test\)", "");
1052+
std::string expectedOutput(R"(test6101A9test)");
1053+
Scanner scanner(stream, ScannerKind::SpecialComment);
1054+
BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
1055+
BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalEscapeSequence);
1056+
}
1057+
1058+
BOOST_AUTO_TEST_CASE(special_comment_with_escaped_quotes)
1059+
{
1060+
CharStream stream(R"("test\\\"")", "");
1061+
std::string expectedOutput(R"(test)");
1062+
Scanner scanner(stream, ScannerKind::SpecialComment);
1063+
BOOST_REQUIRE(scanner.currentToken() == Token::StringLiteral);
1064+
BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
1065+
}
1066+
1067+
BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
1068+
{
1069+
CharStream stream(R"("test)", "");
1070+
Scanner scanner(stream, ScannerKind::SpecialComment);
1071+
BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
1072+
BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalStringEndQuote);
1073+
}
1074+
10291075
BOOST_AUTO_TEST_SUITE_END()
10301076

10311077
} // end namespaces

0 commit comments

Comments
 (0)