Skip to content

Commit

Permalink
gccrs: clean up Codepoint and InputSource
Browse files Browse the repository at this point in the history
gcc/rust/ChangeLog:

	* lex/rust-codepoint.h: Moved to...
	* util/rust-codepoint.h: ...here.
	* lex/rust-input-source.h: Add missing license.
	* util/rust-unicode.cc: Add missing license.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
  • Loading branch information
tamaroning committed Aug 9, 2023
1 parent 0d678b2 commit dd9ab30
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 23 deletions.
70 changes: 47 additions & 23 deletions gcc/rust/lex/rust-input-source.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

#ifndef RUST_INPUT_SOURCE_H
#define RUST_INPUT_SOURCE_H

#include "rust-codepoint.h"
#include "optional.h"

namespace Rust {

// The three bytes of the UTF-8 byte order mark (BOM), in the order they are
// compared against the first, second and third byte of a 3-byte sequence.
constexpr uint8_t UTF8_BOM1 = 0xEF;
constexpr uint8_t UTF8_BOM2 = 0xBB;
constexpr uint8_t UTF8_BOM3 = 0xBF;

// Largest lead-byte value that is decoded as a 1-byte (ASCII) codepoint.
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
// Sentinel value returned by the decoder for a malformed byte sequence; the
// initialisation loop compares against it to detect input that is not valid
// UTF-8.
constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;

// Base class for input sources; decodes the underlying bytes as UTF-8
// codepoints.
class InputSource
{
Expand All @@ -23,22 +49,22 @@ class InputSource

if ((int32_t) input == EOF)
return Codepoint::eof ();
else if (input < 128)
else if (input <= MAX_ASCII_CODEPOINT)
{
// ascii -- 1 byte
return {input};
}
else if ((input & 0xC0) == 0x80)
{
// invalid (continuation; can't be first char)
return {0xFFFE};
return {CODEPOINT_INVALID};
}
else if ((input & 0xE0) == 0xC0)
{
// 2 bytes
uint8_t input2 = next_byte ();
if ((input2 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
return output;
Expand All @@ -50,23 +76,23 @@ class InputSource
// If the first two bytes are 0xEF 0xBB, the sequence is only accepted
// when the third byte completes the UTF-8 BOM (0xEF 0xBB 0xBF); any other
// third byte is reported as an invalid codepoint.
if (input == 0xEF && input2 == 0xBB)
if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
{
uint8_t input3 = next_byte ();
if (input3 == 0xBF)
if (input3 == UTF8_BOM3)
// found BOM
return next_codepoint ();
else
return {0xFFFE};
return {CODEPOINT_INVALID};
}

if ((input2 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint8_t input3 = next_byte ();

if ((input3 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
| ((input3 & 0x3F) << 0);
Expand All @@ -77,39 +103,42 @@ class InputSource
// 4 bytes
uint8_t input2 = next_byte ();
if ((input2 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint8_t input3 = next_byte ();
if ((input3 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint8_t input4 = next_byte ();
if ((input4 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
return {output};
}
else
{
return {0xFFFE};
return {CODEPOINT_INVALID};
}
}

protected:
// Check if the input source is valid as utf-8 and copy all characters to
// `chars`.
// This method must be called by the constructor to initialize the input
// source. We cannot move this to the constructor because it calls a
// virtual method.
void init ()
{
// Check if the input source is valid as utf-8 and copy all characters to
// `chars`.
Codepoint char32 = next_codepoint ();
while (!char32.is_eof () && char32 != 0xFFFE)
while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
{
chars.push_back (char32);
char32 = next_codepoint ();
}

if (char32 == 0xFFFE)
if (char32 == CODEPOINT_INVALID)
{
// Input source is not valid as utf-8.
is_valid_utf8 = false;
Expand Down Expand Up @@ -158,11 +187,7 @@ class FileInputSource : public InputSource

public:
// Create new input source from file.
FileInputSource (FILE *input) : InputSource (), input (input)
{
// TODO make this better?
init ();
}
FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
};

class BufferInputSource : public InputSource
Expand All @@ -175,15 +200,14 @@ class BufferInputSource : public InputSource
{
if (offs >= buffer.size ())
return EOF;
return (uint8_t) buffer.at (offs++);
return static_cast<uint8_t> (buffer.at (offs++));
}

public:
// Create new input source from a string buffer, starting at `offset`.
BufferInputSource (const std::string &b, size_t offset)
: InputSource (), buffer (b), offs (offset)
{
// TODO make this better?
init ();
}
};
Expand Down
File renamed without changes.
18 changes: 18 additions & 0 deletions gcc/rust/util/rust-unicode.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

#include "rust-system.h"
#include "optional.h"
#include "selftest.h"
Expand Down

0 comments on commit dd9ab30

Please sign in to comment.