gccrs: add utf-8 validation for input source

gcc/rust/ChangeLog:

	* lex/rust-lex.cc (Lexer::input_source_is_valid_utf8): New method of `Lexer`.
	* lex/rust-lex.h: Likewise.
	* rust-session-manager.cc (Session::compile_crate): Add error.

gcc/testsuite/ChangeLog:

	* rust/compile/broken_utf8.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Rust-GCC · Jul 6, 2023 · 46a61f0 · 46a61f0
1 parent 5e735e9
commit 46a61f0
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 7 deletions.
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
@@ -167,6 +167,12 @@ Lexer::~Lexer ()
  // line_map->stop();
 }
 
+bool
+Lexer::input_source_is_valid_utf8 ()
+{
+ return raw_input_source->is_valid ();
+}
+
 /* TODO: need to optimise somehow to avoid the virtual function call in the
  * tight loop. Best idea at the moment is CRTP, but that might make lexer
  * implementation annoying when storing the "base class" (i.e. would need

diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
@@ -175,6 +175,8 @@ class Lexer
  Lexer (Lexer &&other) = default;
  Lexer &operator= (Lexer &&other) = default;
 
+ bool input_source_is_valid_utf8 ();
+
  // Returns token n tokens ahead of current position.
  const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
  // Peeks the current token.
@@ -217,9 +219,9 @@ class Lexer
 
  Codepoint next_codepoint ()
  {
- uint8_t input = next_byte ();
+ uint32_t input = next_byte ();
 
- if ((int8_t) input == EOF)
+ if ((int32_t) input == EOF)
  return Codepoint::eof ();
  else if (input < 128)
  {
@@ -246,11 +248,13 @@ class Lexer
  // 3 bytes or UTF-8 BOM
  uint8_t input2 = next_byte ();
  // If the second byte is equal to 0xBB then the input is no longer a
- // valid UTF-8 char.
+ // valid UTF-8 char. Then, we check if the third byte makes up a UTF
+ // BOM.
  if (input == 0xEF && input2 == 0xBB)
  {
  uint8_t input3 = next_byte ();
  if (input3 == 0xBF)
+ // found BOM
  return next_codepoint ();
  else
  return {0xFFFE};
@@ -289,8 +293,6 @@ class Lexer
  }
  else
  {
- // rust_error_at (get_current_location (),
- // "invalid UTF-8 [SECND] (too long)");
  return {0xFFFE};
  }
  }
@@ -362,8 +364,7 @@ class Lexer
  {
  if (offs >= buffer.size ())
  return EOF;
-
- return buffer.at (offs++);
+ return (uint8_t) buffer.at (offs++);
  }
 
  public:

diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
@@ -497,6 +497,14 @@ Session::compile_crate (const char *filename)
 
  Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt);
 
+ if (!lex.input_source_is_valid_utf8 ())
+ {
+ rust_error_at (Linemap::unknown_location (),
+ "cannot read %s; stream did not contain valid UTF-8",
+ filename);
+ return;
+ }
+
  Parser<Lexer> parser (lex);
 
  // generate crate from parser

diff --git a/gcc/testsuite/rust/compile/broken_utf8.rs b/gcc/testsuite/rust/compile/broken_utf8.rs
@@ -0,0 +1,2 @@
+// { dg-excess-errors "stream did not contain valid UTF-8" }
+ÿ