From 42acccd91c94f438410ff7009c1925963d8bcd19 Mon Sep 17 00:00:00 2001 From: Raiki Tamura Date: Sun, 6 Aug 2023 19:17:17 +0900 Subject: [PATCH] gccrs: clean up Codepoint and InputSource gcc/rust/ChangeLog: * lex/rust-codepoint.h: Moved to... * util/rust-codepoint.h: ...here. * lex/rust-input-source.h: Add missing license * util/rust-unicode.cc: Add missing license Signed-off-by: Raiki Tamura --- gcc/rust/lex/rust-input-source.h | 70 +++++++++++++++++-------- gcc/rust/{lex => util}/rust-codepoint.h | 0 gcc/rust/util/rust-unicode.cc | 18 +++++++ 3 files changed, 65 insertions(+), 23 deletions(-) rename gcc/rust/{lex => util}/rust-codepoint.h (100%) diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h index 07137debb8f3..32261a05cae3 100644 --- a/gcc/rust/lex/rust-input-source.h +++ b/gcc/rust/lex/rust-input-source.h @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #ifndef RUST_INPUT_SOURCE_H #define RUST_INPUT_SOURCE_H @@ -5,6 +23,14 @@ #include "optional.h" namespace Rust { + +constexpr uint8_t UTF8_BOM1 = 0xEF; +constexpr uint8_t UTF8_BOM2 = 0xBB; +constexpr uint8_t UTF8_BOM3 = 0xBF; + +constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; +constexpr uint32_t CODEPOINT_INVALID = 0xFFFE; + // Input source wrapper thing. 
class InputSource { @@ -23,7 +49,7 @@ class InputSource if ((int32_t) input == EOF) return Codepoint::eof (); - else if (input < 128) + else if (input <= MAX_ASCII_CODEPOINT) { // ascii -- 1 byte return {input}; @@ -31,14 +57,14 @@ class InputSource else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) - return {0xFFFE}; + return {CODEPOINT_INVALID}; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return output; @@ -50,23 +76,23 @@ class InputSource // If the second byte is equal to 0xBB then the input is no longer a // valid UTF-8 char. Then, we check if the third byte makes up a UTF // BOM. - if (input == 0xEF && input2 == 0xBB) + if (input == UTF8_BOM1 && input2 == UTF8_BOM2) { uint8_t input3 = next_byte (); - if (input3 == 0xBF) + if (input3 == UTF8_BOM3) // found BOM return next_codepoint (); else - return {0xFFFE}; + return {CODEPOINT_INVALID}; } if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); @@ -77,15 +103,15 @@ class InputSource // 4 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input4 = next_byte (); if ((input4 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); @@ -93,23 +119,26 @@ class InputSource } else { - return {0xFFFE}; + return {CODEPOINT_INVALID}; } } protected: - // Check if the input source is valid as 
utf-8 and copy all characters to - // `chars`. + // This method must be called by the constructor to initialize the input + // source. We cannot move this to the constructor because it calls a + // virtual method . void init () { + // Check if the input source is valid as utf-8 and copy all characters to + // `chars`. Codepoint char32 = next_codepoint (); - while (!char32.is_eof () && char32 != 0xFFFE) + while (!char32.is_eof () && char32 != CODEPOINT_INVALID) { chars.push_back (char32); char32 = next_codepoint (); } - if (char32 == 0xFFFE) + if (char32 == CODEPOINT_INVALID) { // Input source is not valid as utf-8. is_valid_utf8 = false; @@ -158,11 +187,7 @@ class FileInputSource : public InputSource public: // Create new input source from file. - FileInputSource (FILE *input) : InputSource (), input (input) - { - // TODO make this better? - init (); - } + FileInputSource (FILE *input) : InputSource (), input (input) { init (); } }; class BufferInputSource : public InputSource @@ -175,7 +200,7 @@ class BufferInputSource : public InputSource { if (offs >= buffer.size ()) return EOF; - return (uint8_t) buffer.at (offs++); + return static_cast<uint8_t> (buffer.at (offs++)); } public: @@ -183,7 +208,6 @@ class BufferInputSource : public InputSource BufferInputSource (const std::string &b, size_t offset) : InputSource (), buffer (b), offs (offset) { - // TODO make this better? init (); } }; diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h similarity index 100% rename from gcc/rust/lex/rust-codepoint.h rename to gcc/rust/util/rust-codepoint.h diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index b2ddaf0b9cec..95653cb760db 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. 
+ +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #include "rust-system.h" #include "optional.h" #include "selftest.h"