Skip to content

Commit

Permalink
gccrs: add utf-8 validation for input source
Browse files Browse the repository at this point in the history
gcc/rust/ChangeLog:

	* lex/rust-lex.cc (Lexer::input_source_is_valid_utf8): New method of `Lexer`.
	* lex/rust-lex.h: Likewise.
	* rust-session-manager.cc (Session::compile_crate): Add error.

gcc/testsuite/ChangeLog:

	* rust/compile/broken_utf8.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
  • Loading branch information
tamaroning authored and philberty committed Jul 6, 2023
1 parent 5e735e9 commit 46a61f0
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 7 deletions.
6 changes: 6 additions & 0 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ Lexer::~Lexer ()
// line_map->stop();
}

// Reports the validity of the lexer's raw input source by forwarding to
// raw_input_source->is_valid ().  Callers use a false result to reject
// input that is not valid UTF-8 before parsing begins.
bool
Lexer::input_source_is_valid_utf8 ()
{
  const bool source_ok = raw_input_source->is_valid ();
  return source_ok;
}

/* TODO: need to optimise somehow to avoid the virtual function call in the
* tight loop. Best idea at the moment is CRTP, but that might make lexer
* implementation annoying when storing the "base class" (i.e. would need
Expand Down
15 changes: 8 additions & 7 deletions gcc/rust/lex/rust-lex.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ class Lexer
Lexer (Lexer &&other) = default;
Lexer &operator= (Lexer &&other) = default;

bool input_source_is_valid_utf8 ();

// Returns token n tokens ahead of current position.
const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
// Peeks the current token.
Expand Down Expand Up @@ -217,9 +219,9 @@ class Lexer

Codepoint next_codepoint ()
{
uint8_t input = next_byte ();
uint32_t input = next_byte ();

if ((int8_t) input == EOF)
if ((int32_t) input == EOF)
return Codepoint::eof ();
else if (input < 128)
{
Expand All @@ -246,11 +248,13 @@ class Lexer
// 3 bytes or UTF-8 BOM
uint8_t input2 = next_byte ();
// If the second byte is equal to 0xBB then the input is no longer a
// valid UTF-8 char.
// valid UTF-8 char. Then, we check if the third byte makes up a UTF-8
// BOM.
if (input == 0xEF && input2 == 0xBB)
{
uint8_t input3 = next_byte ();
if (input3 == 0xBF)
// found BOM
return next_codepoint ();
else
return {0xFFFE};
Expand Down Expand Up @@ -289,8 +293,6 @@ class Lexer
}
else
{
// rust_error_at (get_current_location (),
// "invalid UTF-8 [SECND] (too long)");
return {0xFFFE};
}
}
Expand Down Expand Up @@ -362,8 +364,7 @@ class Lexer
{
if (offs >= buffer.size ())
return EOF;

return buffer.at (offs++);
return (uint8_t) buffer.at (offs++);
}

public:
Expand Down
8 changes: 8 additions & 0 deletions gcc/rust/rust-session-manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,14 @@ Session::compile_crate (const char *filename)

Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt);

if (!lex.input_source_is_valid_utf8 ())
{
rust_error_at (Linemap::unknown_location (),
"cannot read %s; stream did not contain valid UTF-8",
filename);
return;
}

Parser<Lexer> parser (lex);

// generate crate from parser
Expand Down
2 changes: 2 additions & 0 deletions gcc/testsuite/rust/compile/broken_utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// { dg-excess-errors "stream did not contain valid UTF-8" }
ÿ

0 comments on commit 46a61f0

Please sign in to comment.