From 1492eb517716db9fc8b3299581b6ff970574c628 Mon Sep 17 00:00:00 2001
From: clonker <clonker+gh@gmail.com>
Date: Mon, 1 Jul 2024 11:58:37 +0200
Subject: [PATCH] handle dangling backslashes in special comments and some
 cleanup

---
 Changelog.md                 |  4 ++--
 liblangutil/Scanner.cpp      | 21 ++++++++++++++-------
 liblangutil/Scanner.h        |  2 +-
 test/liblangutil/Scanner.cpp | 10 +++++++++-
 test/libyul/Parser.cpp       |  4 ----
 5 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index d38eb30e38c0..cba038a56b98 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -3,7 +3,7 @@
 Language Features:
  * Accept declarations of state variables with ``transient`` data location (parser support only, no code generation yet).
  * Make ``require(bool, Error)`` available when using the legacy pipeline.
- * Yul: Parsing rules for source location comments have been relaxed: Whitespace between the indices as well as single-quoted code snippets are now allowed.
+ * Yul: Parsing rules for source location comments have been relaxed: Whitespace between the location components as well as single-quoted code snippets are now allowed.
 
 
 Compiler Features:
@@ -14,12 +14,12 @@ Compiler Features:
 
 
 Bugfixes:
- * AsmParser: Alleviates risk of encountering a segfault for very long comments.
  * SMTChecker: Fix error that reports invalid number of verified checks for BMC and CHC engines.
  * SMTChecker: Fix formatting of unary minus expressions in invariants.
  * SMTChecker: Fix internal compiler error when reporting proved targets for BMC engine.
  * TypeChecker: Fix segfault when assigning nested tuple to tuple.
  * Yul Optimizer: Name simplification could lead to forbidden identifiers with a leading and/or trailing dot, e.g., ``x._`` would get simplified into ``x.``.
+ * Yul Parser: Fix segfault when parsing very long location comments.
 
 
 ### 0.8.26 (2024-05-21)
diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp
index 5789d3cec663..4a51642c16d9 100644
--- a/liblangutil/Scanner.cpp
+++ b/liblangutil/Scanner.cpp
@@ -759,12 +759,13 @@ void Scanner::scanToken()
 	m_tokens[NextNext].extendedTokenInfo = std::make_tuple(m, n);
 }
 
-bool Scanner::scanEscape()
+bool Scanner::scanEscape(bool const _rejectInvalidEscapes)
 {
 	char c = m_char;
 
-	// Skip escaped newlines.
-	if (m_kind != ScannerKind::SpecialComment && tryScanEndOfLine())
+	// Normally an escaped newline (a backslash directly before a line break) is skipped since it is meaningless.
+	// When invalid escapes are not rejected (as in special comments), the backslash is preserved instead.
+	if (_rejectInvalidEscapes && tryScanEndOfLine())
 		return true;
 	advance();
 
@@ -790,7 +791,7 @@ bool Scanner::scanEscape()
 			addUnicodeAsUTF8(*codepoint);
 			return true;
 		}
-		else if (m_kind != ScannerKind::SpecialComment)
+		else if (_rejectInvalidEscapes)
 			return false;
 		else
 		{
@@ -800,7 +801,7 @@ bool Scanner::scanEscape()
 		}
 	}
 	case 'x':
-		if (m_kind != ScannerKind::SpecialComment)
+		if (_rejectInvalidEscapes)
 		{
 			if (!scanHexByte(c))
 				return false;
@@ -814,7 +815,7 @@ bool Scanner::scanEscape()
 			}
 		break;
 	default:
-		if (m_kind != ScannerKind::SpecialComment)
+		if (_rejectInvalidEscapes)
 			return false;
 		else
 		{
@@ -860,7 +861,13 @@ Token Scanner::scanString(bool const _isUnicode)
 		if (m_kind == ScannerKind::SpecialComment)
 		{
 			if (c == '\\')
-				scanEscape();
+			{
+				if (isSourcePastEndOfInput())
+					return setError(ScannerError::IllegalEscapeSequence);
+				bool const validEscape = scanEscape(false /* _rejectInvalidEscapes */);
+				// A dangling backslash at end of input is rejected above; any other escape is accepted in special comments.
+				solAssert(validEscape);
+			}
 			else
 				addLiteralChar(c);
 		}
diff --git a/liblangutil/Scanner.h b/liblangutil/Scanner.h
index c734a368ccae..d39867834a27 100644
--- a/liblangutil/Scanner.h
+++ b/liblangutil/Scanner.h
@@ -246,7 +246,7 @@ class Scanner
 	/// Scans an escape-sequence which is part of a string and adds the
 	/// decoded character to the current literal. Returns true if a pattern
 	/// is scanned.
-	bool scanEscape();
+	bool scanEscape(bool _rejectInvalidEscapes = true);
 
 	/// @returns true iff we are currently positioned at a unicode line break.
 	bool isUnicodeLinebreak();
diff --git a/test/liblangutil/Scanner.cpp b/test/liblangutil/Scanner.cpp
index fae024809e69..fb499e05c3fb 100644
--- a/test/liblangutil/Scanner.cpp
+++ b/test/liblangutil/Scanner.cpp
@@ -1047,11 +1047,19 @@ a	Ʃtest\f)");
 	BOOST_REQUIRE(scanner.currentLiteral() == expectedOutput);
 }
 
-BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
+BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_escape_sequence_at_eos)
 {
 	CharStream stream(R"("test\)", "");
 	Scanner scanner(stream, ScannerKind::SpecialComment);
 	BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
+	BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalEscapeSequence);
+}
+
+BOOST_AUTO_TEST_CASE(special_comment_with_unterminated_string)
+{
+	CharStream stream(R"("test)", "");
+	Scanner scanner(stream, ScannerKind::SpecialComment);
+	BOOST_REQUIRE(scanner.currentToken() == Token::Illegal);
 	BOOST_REQUIRE(scanner.currentError() == ScannerError::IllegalStringEndQuote);
 }
 
diff --git a/test/libyul/Parser.cpp b/test/libyul/Parser.cpp
index ecdba2a9552c..ca8784ddb6ef 100644
--- a/test/libyul/Parser.cpp
+++ b/test/libyul/Parser.cpp
@@ -814,8 +814,6 @@ BOOST_AUTO_TEST_CASE(customSourceLocations_invalid_escapes)
 	EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{});
 	std::shared_ptr<Block> result = parse(sourceText, dialect, reporter);
 	BOOST_REQUIRE(!!result && errorList.size() == 0);
-	// the second source location is not parsed as such, as the hex string isn't interpreted as snippet but
-	// as the beginning of the tail in AsmParser
 	CHECK_LOCATION(result->debugData->originLocation, "source0", 111, 222);
 }
 
@@ -831,8 +829,6 @@ BOOST_AUTO_TEST_CASE(customSourceLocations_single_quote_snippet_with_whitespaces
 	EVMDialectTyped const& dialect = EVMDialectTyped::instance(EVMVersion{});
 	std::shared_ptr<Block> result = parse(sourceText, dialect, reporter);
 	BOOST_REQUIRE(!!result && errorList.size() == 0);
-	// the second source location is not parsed as such, as the hex string isn't interpreted as snippet but
-	// as the beginning of the tail in AsmParser
 	CHECK_LOCATION(result->debugData->originLocation, "source1", 222, 333);
 }