handle dangling backslashes in special comments and some cleanup

ethereum · Jul 1, 2024 · 1492eb5 · 1492eb5
1 parent a0cea0c
commit 1492eb5
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 15 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -3,7 +3,7 @@
 Language Features:
  * Accept declarations of state variables with ``transient`` data location (parser support only, no code generation yet).
  * Make ``require(bool, Error)`` available when using the legacy pipeline.
- * Yul: Parsing rules for source location comments have been relaxed: Whitespace between the indices as well as single-quoted code snippets are now allowed.
+ * Yul: Parsing rules for source location comments have been relaxed: Whitespace between the location components as well as single-quoted code snippets are now allowed.
 
 
 Compiler Features:
@@ -14,12 +14,12 @@ Compiler Features:
 
 
 Bugfixes:
- * AsmParser: Alleviates risk of encountering a segfault for very long comments.
  * SMTChecker: Fix error that reports invalid number of verified checks for BMC and CHC engines.
  * SMTChecker: Fix formatting of unary minus expressions in invariants.
  * SMTChecker: Fix internal compiler error when reporting proved targets for BMC engine.
  * TypeChecker: Fix segfault when assigning nested tuple to tuple.
  * Yul Optimizer: Name simplification could lead to forbidden identifiers with a leading and/or trailing dot, e.g., ``x._`` would get simplified into ``x.``.
+ * Yul Parser: Fix segfault when parsing very long location comments.
 
 
 ### 0.8.26 (2024-05-21)

diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp
@@ -759,12 +759,13 @@ void Scanner::scanToken()
  m_tokens[NextNext].extendedTokenInfo = std::make_tuple(m, n);
 }
 
-bool Scanner::scanEscape()
+bool Scanner::scanEscape(bool const _rejectInvalidEscapes)
 {
  char c = m_char;
 
- // Skip escaped newlines.
- if (m_kind != ScannerKind::SpecialComment && tryScanEndOfLine())
+ // Normally we ignore the backslash just before a newline since it's meaningless.
+ // When invalid escapes are not rejected, though, we preserve it.
+ if (_rejectInvalidEscapes && tryScanEndOfLine())
  return true;
  advance();
 
@@ -790,7 +791,7 @@ bool Scanner::scanEscape()
  addUnicodeAsUTF8(*codepoint);
  return true;
  }
- else if (m_kind != ScannerKind::SpecialComment)
+ else if (_rejectInvalidEscapes)
  return false;
  else
  {
@@ -800,7 +801,7 @@ bool Scanner::scanEscape()
  }
  }
  case 'x':
- if (m_kind != ScannerKind::SpecialComment)
+ if (_rejectInvalidEscapes)
  {
  if (!scanHexByte(c))
  return false;
@@ -814,7 +815,7 @@ bool Scanner::scanEscape()
  }
  break;
  default:
- if (m_kind != ScannerKind::SpecialComment)
+ if (_rejectInvalidEscapes)
  return false;
  else
  {
@@ -860,7 +861,13 @@ Token Scanner::scanString(bool const _isUnicode)
  if (m_kind == ScannerKind::SpecialComment)
  {
  if (c == '\\')
- scanEscape();
+ {
+ if (isSourcePastEndOfInput())
+ return setError(ScannerError::IllegalEscapeSequence);
+ bool const validEscape = scanEscape(false /* _rejectInvalidEscapes */);
+ // There are no invalid escapes in special comments, except for an unterminated backslash at the end of the source.
+ solAssert(validEscape);
+ }
  else
  addLiteralChar(c);
  }

diff --git a/liblangutil/Scanner.h b/liblangutil/Scanner.h
@@ -246,7 +246,7 @@ class Scanner
  /// Scans an escape-sequence which is part of a string and adds the
  /// decoded character to the current literal. Returns true if a pattern
  /// is scanned.
- bool scanEscape();
+ bool scanEscape(bool _rejectInvalidEscapes = true);
 
  /// @returns true iff we are currently positioned at a unicode line break.
  bool isUnicodeLinebreak();

diff --git a/test/liblangutil/Scanner.cpp b/test/liblangutil/Scanner.cpp
@@ -1047,11 +1047,19 @@ a Ʃtest\f)");
  BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
 }
 
-BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
+BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_escape_sequence_at_eos)
 {
  CharStream stream(R"("test\)", "");
  Scanner scanner(stream, ScannerKind::SpecialComment);
  BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
+ BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalEscapeSequence);
+}
+
+BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
+{
+ CharStream stream(R"("test)", "");
+ Scanner scanner(stream, ScannerKind::SpecialComment);
+ BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
  BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalStringEndQuote);
 }
 

diff --git a/test/libyul/Parser.cpp b/test/libyul/Parser.cpp
@@ -814,8 +814,6 @@ BOOST_AUTO_TEST_CASE(customSourceLocations_invalid_escapes)
  EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{});
  std::shared_ptr<Block> result = parse(sourceText, dialect, reporter);
  BOOST_REQUIRE(!!result && errorList.size() == 0);
- // the second source location is not parsed as such, as the hex string isn't interpreted as snippet but
- // as the beginning of the tail in AsmParser
  CHECK_LOCATION(result->debugData->originLocation, "source0", 111, 222);
 }
 
@@ -831,8 +829,6 @@ BOOST_AUTO_TEST_CASE(customSourceLocations_single_quote_snippet_with_whitespaces
  EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{});
  std::shared_ptr<Block> result = parse(sourceText, dialect, reporter);
  BOOST_REQUIRE(!!result && errorList.size() == 0);
- // the second source location is not parsed as such, as the hex string isn't interpreted as snippet but
- // as the beginning of the tail in AsmParser
  CHECK_LOCATION(result->debugData->originLocation, "source1", 222, 333);
 }