gccrs: Normalize all identifier tokens

gcc/rust/ChangeLog:

	* lex/rust-lex.cc (assert_source_content): Fix namespace specifier
	(test_buffer_input_source): Likewise.
	(test_file_input_source): Likewise.
	* lex/rust-lex.h: Move InputSource ...
	* lex/rust-input-source.h: ... to here. (New file)
	* lex/rust-token.cc (nfc_normalize_token_string): New function
	* lex/rust-token.h (nfc_normalize_token_string): New function
	* rust-lang.cc (run_rust_tests): Modify order of selftests.
	* rust-session-manager.cc (validate_crate_name): Modify interface of
	Utf8String.
	* util/rust-unicode.cc (lookup_cc): Modify codepoint_t typedef.
	(lookup_recomp): Likewise.
	(recursive_decomp_cano): Likewise.
	(decomp_cano): Likewise.
	(sort_cano): Likewise.
	(compose_hangul): Likewise.
	(assert_normalize): Likewise.
	(Utf8String::nfc_normalize): New function.
	* util/rust-unicode.h: Modify interface of Utf8String.

gcc/testsuite/ChangeLog:

	* rust/compile/unicode_norm1.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Rust-GCC · Aug 9, 2023 · 2fa4f4a · 2fa4f4a
1 parent f7d9373
commit 2fa4f4a
Show file tree

Hide file tree

Showing 10 changed files with 304 additions and 228 deletions.
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
@@ -0,0 +1,193 @@
+#ifndef RUST_INPUT_SOURCE_H
+#define RUST_INPUT_SOURCE_H
+
+#include "rust-codepoint.h"
+#include "optional.h"
+
+namespace Rust {
+// Abstract input source: decodes UTF-8 bytes into a buffer of Codepoints.
+class InputSource
+{
+private:
+  // Index into `chars` of the codepoint the next call to next () returns.
+  unsigned int pos;
+  std::vector<Codepoint> chars;
+  bool is_valid_utf8;
+
+  // Returns the next raw byte of the input, or EOF when it is exhausted.
+  virtual int next_byte () = 0;
+
+  Codepoint next_codepoint ()
+  {
+    uint32_t input = next_byte ();
+
+    if ((int32_t) input == EOF)
+      return Codepoint::eof ();
+    else if (input < 128)
+      {
+	// ASCII: single-byte sequence (0xxxxxxx).
+	return {input};
+      }
+    else if ((input & 0xC0) == 0x80)
+      {
+	// Invalid: a continuation byte (10xxxxxx) cannot start a sequence.
+	return {0xFFFE};
+      }
+    else if ((input & 0xE0) == 0xC0)
+      {
+	// 2-byte sequence (110xxxxx + 1 continuation byte).
+	uint8_t input2 = next_byte ();
+	if ((input2 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+	return output;
+      }
+    else if ((input & 0xF0) == 0xE0)
+      {
+	// 3-byte sequence (1110xxxx + 2 continuation bytes) or UTF-8 BOM.
+	uint8_t input2 = next_byte ();
+	// Lead bytes 0xEF 0xBB are only accepted as the start of a UTF-8 BOM
+	// (EF BB BF), which is skipped. NOTE(review): any other third byte is
+	// reported as invalid, which also rejects U+FEC0..U+FEFE -- confirm.
+	if (input == 0xEF && input2 == 0xBB)
+	  {
+	    uint8_t input3 = next_byte ();
+	    if (input3 == 0xBF)
+	      // Found a BOM: drop it and decode the codepoint that follows.
+	      return next_codepoint ();
+	    else
+	      return {0xFFFE};
+	  }
+
+	if ((input2 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint8_t input3 = next_byte ();
+
+	if ((input3 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+			  | ((input3 & 0x3F) << 0);
+	return {output};
+      }
+    else if ((input & 0xF8) == 0xF0)
+      {
+	// 4-byte sequence (11110xxx + 3 continuation bytes).
+	uint8_t input2 = next_byte ();
+	if ((input2 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint8_t input3 = next_byte ();
+	if ((input3 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint8_t input4 = next_byte ();
+	if ((input4 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+	return {output};
+      }
+    else
+      {
+	return {0xFFFE};
+      }
+  }
+
+protected:
+  // Decodes the input into `chars`, stopping at EOF or at the first invalid
+  // sequence (reported as 0xFFFE); the latter clears `is_valid_utf8`.
+  void init ()
+  {
+    Codepoint char32 = next_codepoint ();
+    while (!char32.is_eof () && char32 != 0xFFFE)
+      {
+	chars.push_back (char32);
+	char32 = next_codepoint ();
+      }
+
+    if (char32 == 0xFFFE)
+      {
+	// Decoding stopped on an invalid sequence: the input is not UTF-8.
+	is_valid_utf8 = false;
+      }
+  }
+
+public:
+  InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
+
+  virtual ~InputSource () {}
+
+  // Returns false once init () has hit an invalid UTF-8 sequence, else true.
+  bool is_valid () { return is_valid_utf8; }
+
+  // Returns the next buffered codepoint and advances; eof once all are read.
+  Codepoint next ()
+  {
+    if (pos >= chars.size ())
+      return Codepoint::eof ();
+    else
+      {
+	Codepoint c = chars[pos];
+	pos++;
+	return c;
+      }
+  }
+
+  // Returns a copy of all decoded codepoints if the input was valid UTF-8,
+  // tl::nullopt otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars ()
+  {
+    if (is_valid ())
+      return {chars};
+    else
+      return tl::nullopt;
+  }
+};
+
+class FileInputSource : public InputSource
+{
+private:
+  // Stream the bytes are read from with fgetc; this class never closes it.
+  FILE *input;
+
+  int next_byte () override { return fgetc (input); }
+
+public:
+  // Create new input source from `input`, decoding it during construction.
+  FileInputSource (FILE *input) : InputSource (), input (input)
+  {
+    // TODO make this better? Reads and decodes the whole stream right here.
+    init ();
+  }
+};
+
+class BufferInputSource : public InputSource
+{
+private:
+  const std::string &buffer;
+  size_t offs;
+
+  int next_byte () override
+  {
+    if (offs >= buffer.size ())
+      return EOF;
+    return (uint8_t) buffer.at (offs++);
+  }
+
+public:
+  // Create new input source from string `b`, starting at byte `offset`.
+  BufferInputSource (const std::string &b, size_t offset)
+    : InputSource (), buffer (b), offs (offset)
+  {
+    // TODO make this better? Decodes all of `b` now; it is held by reference.
+    init ();
+  }
+};
+
+} // namespace Rust
+
+#endif
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
@@ -2534,8 +2534,7 @@ namespace selftest {
 
 // Checks if `src` has the same contents as the given characters
 void
-assert_source_content (Rust::Lexer::InputSource &src,
-		       std::vector<uint32_t> expected)
+assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
 {
   Rust::Codepoint src_char = src.next ();
   for (auto expected_char : expected)
@@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
 void
 test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
 {
-  Rust::Lexer::BufferInputSource source (str, 0);
+  Rust::BufferInputSource source (str, 0);
   assert_source_content (source, expected);
 }
 
@@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
   // Moves to the first character
   fputs (str.c_str (), tmpf);
   std::rewind (tmpf);
-  Rust::Lexer::FileInputSource source (tmpf);
+  Rust::FileInputSource source (tmpf);
   assert_source_content (source, expected);
 }