Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move Codepoint to gcc/rust/util/ #2530

Merged
merged 1 commit into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 47 additions & 23 deletions gcc/rust/lex/rust-input-source.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

#ifndef RUST_INPUT_SOURCE_H
#define RUST_INPUT_SOURCE_H

#include "rust-codepoint.h"
#include "optional.h"

namespace Rust {

// The three bytes that next_codepoint () recognizes, in this order, as a
// UTF-8 byte order mark (BOM) and skips.
constexpr uint8_t UTF8_BOM1 = 0xEF;
constexpr uint8_t UTF8_BOM2 = 0xBB;
constexpr uint8_t UTF8_BOM3 = 0xBF;

// Largest value that is decoded as a single-byte (ASCII) codepoint.
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
// Sentinel returned by the decoder when the input is not valid UTF-8.
constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;

// Input source wrapper: decodes the bytes of an underlying input as UTF-8
// codepoints.
class InputSource
{
Expand All @@ -23,22 +49,22 @@ class InputSource

if ((int32_t) input == EOF)
return Codepoint::eof ();
else if (input < 128)
else if (input <= MAX_ASCII_CODEPOINT)
{
// ascii -- 1 byte
return {input};
}
else if ((input & 0xC0) == 0x80)
{
// invalid (continuation; can't be first char)
return {0xFFFE};
return {CODEPOINT_INVALID};
}
else if ((input & 0xE0) == 0xC0)
{
// 2 bytes
uint8_t input2 = next_byte ();
if ((input2 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
return output;
Expand All @@ -50,23 +76,23 @@ class InputSource
// If the second byte is equal to 0xBB then the input is no longer a
// valid UTF-8 char. Then, we check if the third byte makes up a UTF
// BOM.
if (input == 0xEF && input2 == 0xBB)
if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
{
uint8_t input3 = next_byte ();
if (input3 == 0xBF)
if (input3 == UTF8_BOM3)
// found BOM
return next_codepoint ();
else
return {0xFFFE};
return {CODEPOINT_INVALID};
}

if ((input2 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint8_t input3 = next_byte ();

if ((input3 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
| ((input3 & 0x3F) << 0);
Expand All @@ -77,39 +103,42 @@ class InputSource
// 4 bytes
uint8_t input2 = next_byte ();
if ((input2 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint8_t input3 = next_byte ();
if ((input3 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint8_t input4 = next_byte ();
if ((input4 & 0xC0) != 0x80)
return {0xFFFE};
return {CODEPOINT_INVALID};

uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
return {output};
}
else
{
return {0xFFFE};
return {CODEPOINT_INVALID};
}
}

protected:
// Check if the input source is valid as utf-8 and copy all characters to
// `chars`.
// This method must be called by the constructor of each derived class to
// initialize the input source. We cannot move this call into the InputSource
// constructor because it calls a virtual method.
void init ()
{
// Check if the input source is valid as utf-8 and copy all characters to
// `chars`.
Codepoint char32 = next_codepoint ();
while (!char32.is_eof () && char32 != 0xFFFE)
while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
{
chars.push_back (char32);
char32 = next_codepoint ();
}

if (char32 == 0xFFFE)
if (char32 == CODEPOINT_INVALID)
{
// Input source is not valid as utf-8.
is_valid_utf8 = false;
Expand Down Expand Up @@ -158,11 +187,7 @@ class FileInputSource : public InputSource

public:
// Create new input source from file.
FileInputSource (FILE *input) : InputSource (), input (input)
{
// TODO make this better?
init ();
}
FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
};

class BufferInputSource : public InputSource
Expand All @@ -175,15 +200,14 @@ class BufferInputSource : public InputSource
{
if (offs >= buffer.size ())
return EOF;
return (uint8_t) buffer.at (offs++);
return static_cast<uint8_t> (buffer.at (offs++));
}

public:
// Create new input source from a string buffer, starting at `offset`.
BufferInputSource (const std::string &b, size_t offset)
: InputSource (), buffer (b), offs (offset)
{
// TODO make this better?
init ();
}
};
Expand Down
File renamed without changes.
4 changes: 1 addition & 3 deletions gcc/rust/util/rust-punycode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72;
constexpr uint32_t INITIAL_N = 128;
constexpr char DELIMITER = '-';

constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;

std::string
extract_basic_string (const std::vector<Codepoint> &src)
{
std::string basic_string;
for (auto c : src)
{
if (c.value <= MAX_ASCII_CODEPOINT)
if (c.value <= 0x7F)
basic_string += c.as_string ();
}
return basic_string;
Expand Down
18 changes: 18 additions & 0 deletions gcc/rust/util/rust-unicode.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

#include "rust-system.h"
#include "optional.h"
#include "selftest.h"
Expand Down
Loading