Skip to content

Commit

Permalink
gccrs: Normalize all identifier tokens
Browse files Browse the repository at this point in the history
gcc/rust/ChangeLog:

	* lex/rust-lex.cc (assert_source_content): Fix namespace specifier
	(test_buffer_input_source): Likewise.
	(test_file_input_source): Likewise.
	* lex/rust-lex.h: Move InputSource ...
	* lex/rust-input-source.h: ... to here. (New file)
	* lex/rust-token.cc (nfc_normalize_token_string): New function
	* lex/rust-token.h (nfc_normalize_token_string): New function
	* rust-lang.cc (run_rust_tests): Modify order of selftests.
	* rust-session-manager.cc (validate_crate_name): Modify interface of Utf8String.
	* util/rust-unicode.cc (lookup_cc): Modify codepoint_t typedef.
	(lookup_recomp): Likewise.
	(recursive_decomp_cano): Likewise.
	(decomp_cano): Likewise.
	(sort_cano): Likewise.
	(compose_hangul): Likewise.
	(assert_normalize): Likewise.
	(Utf8String::nfc_normalize): New function.
	* util/rust-unicode.h: Modify interface of Utf8String.

gcc/testsuite/ChangeLog:

	* rust/compile/unicode_norm1.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
  • Loading branch information
tamaroning committed Aug 9, 2023
1 parent f7d9373 commit 2fa4f4a
Show file tree
Hide file tree
Showing 10 changed files with 304 additions and 228 deletions.
193 changes: 193 additions & 0 deletions gcc/rust/lex/rust-input-source.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#ifndef RUST_INPUT_SOURCE_H
#define RUST_INPUT_SOURCE_H

#include "rust-codepoint.h"
#include "optional.h"

namespace Rust {
// Input source wrapper thing.
class InputSource
{
private:
// position of current character
unsigned int pos;
std::vector<Codepoint> chars;
bool is_valid_utf8;

// Overload operator () to return next char from input stream.
virtual int next_byte () = 0;

Codepoint next_codepoint ()
{
uint32_t input = next_byte ();

if ((int32_t) input == EOF)
return Codepoint::eof ();
else if (input < 128)
{
// ascii -- 1 byte
return {input};
}
else if ((input & 0xC0) == 0x80)
{
// invalid (continuation; can't be first char)
return {0xFFFE};
}
else if ((input & 0xE0) == 0xC0)
{
// 2 bytes
uint8_t input2 = next_byte ();
if ((input2 & 0xC0) != 0x80)
return {0xFFFE};

uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
return output;
}
else if ((input & 0xF0) == 0xE0)
{
// 3 bytes or UTF-8 BOM
uint8_t input2 = next_byte ();
// If the second byte is equal to 0xBB then the input is no longer a
// valid UTF-8 char. Then, we check if the third byte makes up a UTF
// BOM.
if (input == 0xEF && input2 == 0xBB)
{
uint8_t input3 = next_byte ();
if (input3 == 0xBF)
// found BOM
return next_codepoint ();
else
return {0xFFFE};
}

if ((input2 & 0xC0) != 0x80)
return {0xFFFE};

uint8_t input3 = next_byte ();

if ((input3 & 0xC0) != 0x80)
return {0xFFFE};

uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
| ((input3 & 0x3F) << 0);
return {output};
}
else if ((input & 0xF8) == 0xF0)
{
// 4 bytes
uint8_t input2 = next_byte ();
if ((input2 & 0xC0) != 0x80)
return {0xFFFE};

uint8_t input3 = next_byte ();
if ((input3 & 0xC0) != 0x80)
return {0xFFFE};

uint8_t input4 = next_byte ();
if ((input4 & 0xC0) != 0x80)
return {0xFFFE};

uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
return {output};
}
else
{
return {0xFFFE};
}
}

protected:
// Check if the input source is valid as utf-8 and copy all characters to
// `chars`.
void init ()
{
Codepoint char32 = next_codepoint ();
while (!char32.is_eof () && char32 != 0xFFFE)
{
chars.push_back (char32);
char32 = next_codepoint ();
}

if (char32 == 0xFFFE)
{
// Input source is not valid as utf-8.
is_valid_utf8 = false;
}
}

public:
InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}

virtual ~InputSource () {}

// Checks if input source is a valid UTF-8 string
bool is_valid () { return is_valid_utf8; }

// get the next UTF-8 character
Codepoint next ()
{
if (pos >= chars.size ())
return Codepoint::eof ();
else
{
Codepoint c = chars[pos];
pos++;
return c;
}
}

// Returns codepoint if input source is a valid UTF-8 string. Returns
// nullopt otherwise.
tl::optional<std::vector<Codepoint>> get_chars ()
{
if (is_valid ())
return {chars};
else
return tl::nullopt;
}
};

// InputSource whose bytes come from a C stdio stream.  The stream is borrowed:
// no code in this class closes it.
class FileInputSource : public InputSource
{
private:
  // Stream the bytes are read from.
  FILE *input;

  // Hands the decoder the next byte of the stream; fgetc () yields EOF at end
  // of file or on a read error.
  int next_byte () override
  {
    return fgetc (input);
  }

public:
  // Wraps `file` and immediately decodes its remaining contents via init ().
  FileInputSource (FILE *file) : InputSource (), input (file)
  {
    // TODO make this better?
    init ();
  }
};

// InputSource whose bytes come from an in-memory string.
class BufferInputSource : public InputSource
{
private:
  // Borrowed string holding the raw bytes.  This class only reads it from
  // next_byte (), which the constructor drives through init ().
  const std::string &buffer;
  // Index of the next byte of `buffer` to hand out.
  size_t offs;

  // Returns the byte at `offs` as a value in 0-255 and advances, or EOF once
  // `offs` has reached the end of the buffer.
  int next_byte () override
  {
    if (offs < buffer.size ())
      return static_cast<uint8_t> (buffer[offs++]);

    return EOF;
  }

public:
  // Create new input source from the contents of `b`, starting at byte
  // `offset`, and decode it immediately via init ().
  BufferInputSource (const std::string &b, size_t offset)
    : InputSource (), buffer (b), offs (offset)
  {
    // TODO make this better?
    init ();
  }
};

} // namespace Rust

#endif
7 changes: 3 additions & 4 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2534,8 +2534,7 @@ namespace selftest {

// Checks if `src` has the same contents as the given characters
void
assert_source_content (Rust::Lexer::InputSource &src,
std::vector<uint32_t> expected)
assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
{
Rust::Codepoint src_char = src.next ();
for (auto expected_char : expected)
Expand All @@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
void
test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
{
Rust::Lexer::BufferInputSource source (str, 0);
Rust::BufferInputSource source (str, 0);
assert_source_content (source, expected);
}

Expand All @@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
// Moves to the first character
fputs (str.c_str (), tmpf);
std::rewind (tmpf);
Rust::Lexer::FileInputSource source (tmpf);
Rust::FileInputSource source (tmpf);
assert_source_content (source, expected);
}

Expand Down
Loading

0 comments on commit 2fa4f4a

Please sign in to comment.