gccrs: clean up Codepoint and InputSource

gcc/rust/ChangeLog:

	* lex/rust-codepoint.h: Moved to...
	* util/rust-codepoint.h: ...here.
	* lex/rust-input-source.h: Add missing license
	* util/rust-unicode.cc: Add missing license

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Rust-GCC · Aug 9, 2023 · dd9ab30 · dd9ab30
1 parent 0d678b2
commit dd9ab30
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 23 deletions.
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
@@ -1,10 +1,36 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
 #ifndef RUST_INPUT_SOURCE_H
 #define RUST_INPUT_SOURCE_H
 
 #include "rust-codepoint.h"
 #include "optional.h"
 
 namespace Rust {
+
+constexpr uint8_t UTF8_BOM1 = 0xEF;
+constexpr uint8_t UTF8_BOM2 = 0xBB;
+constexpr uint8_t UTF8_BOM3 = 0xBF;
+
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
+constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;
+
 // Input source wrapper thing.
 class InputSource
 {
@@ -23,22 +49,22 @@ class InputSource
 
  if ((int32_t) input == EOF)
  return Codepoint::eof ();
- else if (input < 128)
+ else if (input <= MAX_ASCII_CODEPOINT)
  {
  // ascii -- 1 byte
  return {input};
  }
  else if ((input & 0xC0) == 0x80)
  {
  // invalid (continuation; can't be first char)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
  }
  else if ((input & 0xE0) == 0xC0)
  {
  // 2 bytes
  uint8_t input2 = next_byte ();
  if ((input2 & 0xC0) != 0x80)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
 
  uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
  return output;
@@ -50,23 +76,23 @@ class InputSource
  // If the second byte is equal to 0xBB then the input is no longer a
  // valid UTF-8 char. Then, we check if the third byte makes up a UTF
  // BOM.
- if (input == 0xEF && input2 == 0xBB)
+ if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
  {
  uint8_t input3 = next_byte ();
- if (input3 == 0xBF)
+ if (input3 == UTF8_BOM3)
  // found BOM
  return next_codepoint ();
  else
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
  }
 
  if ((input2 & 0xC0) != 0x80)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
 
  uint8_t input3 = next_byte ();
 
  if ((input3 & 0xC0) != 0x80)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
 
  uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
  | ((input3 & 0x3F) << 0);
@@ -77,39 +103,42 @@ class InputSource
  // 4 bytes
  uint8_t input2 = next_byte ();
  if ((input2 & 0xC0) != 0x80)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
 
  uint8_t input3 = next_byte ();
  if ((input3 & 0xC0) != 0x80)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
 
  uint8_t input4 = next_byte ();
  if ((input4 & 0xC0) != 0x80)
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
 
  uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
  return {output};
  }
  else
  {
- return {0xFFFE};
+ return {CODEPOINT_INVALID};
  }
  }
 
 protected:
- // Check if the input source is valid as utf-8 and copy all characters to
- // `chars`.
+ // This method must be called by the constructor of each derived class to
+ // initialize the input source. We cannot move this to the InputSource
+ // constructor because it calls a virtual method.
  void init ()
  {
+ // Check if the input source is valid as utf-8 and copy all characters to
+ // `chars`.
  Codepoint char32 = next_codepoint ();
- while (!char32.is_eof () && char32 != 0xFFFE)
+ while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
  {
  chars.push_back (char32);
  char32 = next_codepoint ();
  }
 
- if (char32 == 0xFFFE)
+ if (char32 == CODEPOINT_INVALID)
  {
  // Input source is not valid as utf-8.
  is_valid_utf8 = false;
@@ -158,11 +187,7 @@ class FileInputSource : public InputSource
 
 public:
  // Create new input source from file.
- FileInputSource (FILE *input) : InputSource (), input (input)
- {
- // TODO make this better?
- init ();
- }
+ FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
 };
 
 class BufferInputSource : public InputSource
@@ -175,15 +200,14 @@ class BufferInputSource : public InputSource
  {
  if (offs >= buffer.size ())
  return EOF;
- return (uint8_t) buffer.at (offs++);
+ return static_cast<uint8_t> (buffer.at (offs++));
  }
 
 public:
  // Create new input source from a string buffer, starting at `offset`.
  BufferInputSource (const std::string &b, size_t offset)
  : InputSource (), buffer (b), offs (offset)
  {
- // TODO make this better?
  init ();
  }
 };

diff --git a/gcc/rust/lex/rust-codepoint.h → gcc/rust/util/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h → gcc/rust/util/rust-codepoint.h
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
@@ -1,3 +1,21 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
 #include "rust-system.h"
 #include "optional.h"
 #include "selftest.h"