Skip to content

Commit

Permalink
Do not skip control characters embedded in malformed UTF-8 characters…
Browse files Browse the repository at this point in the history
… in comments (#1059)
  • Loading branch information
udif authored Jul 15, 2024
1 parent 68ead94 commit b5b8359
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
9 changes: 8 additions & 1 deletion source/parsing/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1306,7 +1306,14 @@ bool Lexer::scanUTF8Char(bool alreadyErrored, uint32_t* code, int& computedLen)
}

if (error) {
errorCount++;
// if error, trim next pointer so that control char is read as next char
if ((computedLen > 1) && (curr[1] < 0x20))
sourceBuffer = curr + 1;
else if ((computedLen > 2) && (curr[2] < 0x20))
sourceBuffer = curr + 2;
else if ((computedLen > 3) && (curr[3] < 0x20))
sourceBuffer = curr + 3;

if (!alreadyErrored)
addDiag(diag::InvalidUTF8Seq, (size_t)(curr - originalBegin));
return false;
Expand Down
43 changes: 43 additions & 0 deletions tests/unittests/parsing/LexerTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,49 @@ TEST_CASE("Line Comment (UTF8)") {
REQUIRE(diagnostics.empty());
}

// Regression test: a line comment containing a truncated multi-byte UTF-8
// sequence must still end at the newline that follows it.
// 0xE0 is the lead byte of a 3-byte sequence; it is followed by one
// continuation byte (0x80) and then '\n' where the last byte should be.
TEST_CASE("Embedded control characters in a broken UTF8 comment") {
    const char text[] = "//\xe0\x80\nendmodule";
    Token token = lexToken(text);

    // The newline must not be swallowed by the malformed sequence, so the
    // keyword on the next line is lexed as the token itself.
    CHECK(token.kind == TokenKind::EndModuleKeyword);

    // REQUIRE (not CHECK) so that the indexing below is never executed on a
    // shorter trivia list, which would read out of bounds.
    REQUIRE(token.trivia().size() == 2);
    CHECK(token.trivia()[0].kind == TriviaKind::LineComment);
    CHECK(token.trivia()[1].kind == TriviaKind::EndOfLine);

    // Exactly one diagnostic is expected: the intentionally invalid UTF-8.
    REQUIRE(diagnostics.size() == 1);
}

// Regression test: a second malformed shape inside a line comment.
// 0x82 is a continuation byte with no lead byte before it, and 0xE8 is the
// lead byte of a 3-byte sequence that is immediately followed by '\n'.
// The comment must still end at that newline.
TEST_CASE("Embedded control characters in a broken UTF8 comment (2)") {
    const char text[] = "//\x82\xe8\nendmodule";
    Token token = lexToken(text);

    // The keyword after the newline must be lexed as the token itself.
    CHECK(token.kind == TokenKind::EndModuleKeyword);

    // REQUIRE (not CHECK) so that the indexing below is never executed on a
    // shorter trivia list, which would read out of bounds.
    REQUIRE(token.trivia().size() == 2);
    CHECK(token.trivia()[0].kind == TriviaKind::LineComment);
    CHECK(token.trivia()[1].kind == TriviaKind::EndOfLine);

    // Exactly one diagnostic is expected: the intentionally invalid UTF-8.
    REQUIRE(diagnostics.size() == 1);
}

// Regression test: eight consecutive line comments, each containing malformed
// UTF-8 (stray continuation byte 0x82, then lead byte 0xE8 cut off by '\n'),
// followed by `endmodule`.
// maxErrors is set below the number of malformed comments; the test expects
// lexing to still reach `endmodule` and to report one diagnostic per comment.
// NOTE(review): this presumably relies on invalid-UTF-8 diagnostics not being
// counted toward the lexer's error limit -- confirm against Lexer.cpp.
TEST_CASE("Embedded control characters in a broken UTF8 comment not affecting lexer errorCount") {
    auto& text = "//\x82\xe8\n//\x82\xe8\n//\x82\xe8\n//\x82\xe8\n//\x82\xe8\n//\x82\xe8\n//"
                 "\x82\xe8\n//\x82\xe8\nendmodule\n";

    LexerOptions options;
    options.maxErrors = 4;

    // This test builds its own Lexer instead of going through lexToken(), so
    // the shared diagnostics container is reset explicitly first.
    diagnostics.clear();
    auto buffer = getSourceManager().assignText(text);
    Lexer lexer(buffer, alloc, diagnostics, options);
    Token token = lexer.lex();

    CHECK(token.kind == TokenKind::EndModuleKeyword);

    // Eight (LineComment, EndOfLine) pairs are expected. REQUIRE (not CHECK)
    // so that the loop below never indexes past the end of a shorter list.
    REQUIRE(token.trivia().size() == 16);
    for (int i = 0; i < 8; i++) {
        CHECK(token.trivia()[2 * i].kind == TriviaKind::LineComment);
        CHECK(token.trivia()[2 * i + 1].kind == TriviaKind::EndOfLine);
    }

    // One diagnostic per malformed comment, even though maxErrors is 4.
    REQUIRE(diagnostics.size() == 8);
}

TEST_CASE("Block Comment (one line)") {
auto& text = "/* comment */";
Token token = lexToken(text);
Expand Down

0 comments on commit b5b8359

Please sign in to comment.