Skip to content

Commit

Permalink
[text_encoding] Added 4-byte UTF-8 encoding.
Browse files Browse the repository at this point in the history
The function text_encoding_encode_UTF8_single now supports code points
below 0x110000 (that is, up to 0x10FFFF).
Fixed an issue in the text_encoding_decode_UTF16BE_single function with
the processing of surrogate pairs.

[tests_text_encoding] Disabled the tests for the text_encoding_encode_decode_UTF8
and text_encoding_encode_UTF8 functions. Added tests for the
text_encoding_encode_UTF8_single and text_encoding_decode_UTF8_single
functions.
Test cases with 4-byte input are disabled in the test for the
text_encoding_decode_UTF8_single function.

Corrected the test data for the text_encoding_UTF_to_UTF8 test: now that
4-byte UTF-8 encoding is supported, the output for this case should no longer
contain the "unknown character" UTF-8 sequence.
Added additional test cases for this test.
  • Loading branch information
TheVice committed Apr 11, 2024
1 parent cdc1c86 commit 9211013
Show file tree
Hide file tree
Showing 4 changed files with 392 additions and 105 deletions.
197 changes: 195 additions & 2 deletions tests.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10841,6 +10841,169 @@ Approximate round trip times in milli-seconds:
<expected_size>2</expected_size>
<output>65023</output>
</text_encoding_encode_UTF16>
<text_encoding_encode_UTF8_single>
<return>1</return>
<output>0x00</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x79</input>
<return>1</return>
<output>0x79</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>121</input>
<return>1</return>
<output>0x79</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x80</input>
<return>2</return>
<output>0xC2 0x80</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x799</input>
<return>2</return>
<output>0xDE 0x99</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x800</input>
<return>3</return>
<output>0xE0 0xA0 0x80</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xD7FF</input>
<return>3</return>
<output>0xED 0x9F 0xBF</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xD800</input>
<return>3</return>
<output>0xEF 0xBF 0xBD</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xD801</input>
<return>3</return>
<output>0xEF 0xBF 0xBD</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xDFFE</input>
<return>3</return>
<output>0xEF 0xBF 0xBD</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xDFFF</input>
<return>3</return>
<output>0xEF 0xBF 0xBD</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xE000</input>
<return>3</return>
<output>0xEE 0x80 0x80</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0xFFFF</input>
<return>3</return>
<output>0xEF 0xBF 0xBF</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x10000</input>
<return>4</return>
<output>0xF0 0x90 0x80 0x80</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x10001</input>
<return>4</return>
<output>0xF0 0x90 0x80 0x81</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x10FFFF</input>
<return>4</return>
<output>0xF4 0x8F 0xBF 0xBF</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x110000</input>
<return>3</return>
<output>0xEF 0xBF 0xBD</output>
</text_encoding_encode_UTF8_single>
<text_encoding_encode_UTF8_single>
<input>0x110001</input>
<return>3</return>
<output>0xEF 0xBF 0xBD</output>
</text_encoding_encode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0x00</input>
<return>1</return>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0x79</input>
<return>1</return>
<output>0x79</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xC2 0x80</input>
<return>2</return>
<output>0x80</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xDE 0x99</input>
<return>2</return>
<output>0x799</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xE0 0xA0 0x80</input>
<return>3</return>
<output>0x800</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xED 0x9F 0xBF</input>
<return>3</return>
<output>0xD7FF</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xEF 0xBF 0xBD</input>
<return>3</return>
<output>0xFFFD</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xEF 0xBF 0xBD</input>
<return>3</return>
<output>0xFFFD</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xEF 0xBF 0xBD</input>
<return>3</return>
<output>0xFFFD</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xEF 0xBF 0xBD</input>
<return>3</return>
<output>0xFFFD</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xEE 0x80 0x80</input>
<return>3</return>
<output>0xE000</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xEF 0xBF 0xBF</input>
<return>3</return>
<output>0xFFFF</output>
</text_encoding_decode_UTF8_single>
<!--text_encoding_decode_UTF8_single>
<input>0xF0 0x90 0x80 0x80</input>
<return>4</return>
<output>0x10000</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xF0 0x90 0x80 0x81</input>
<return>4</return>
<output>0x10001</output>
</text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xF4 0x8F 0xBF 0xBF</input>
<return>4</return>
<output>0x10FFFF</output>
</text_encoding_decode_UTF8_single-->
<text_encoding_UTF_to_UTF8>
<encoding>UTF7</encoding>
</text_encoding_UTF_to_UTF8>
Expand Down Expand Up @@ -10961,10 +11124,40 @@ Approximate round trip times in milli-seconds:
<output>0x64 0x6F 0x67 0x2E </output>
</text_encoding_UTF_to_UTF8>
<text_encoding_UTF_to_UTF8>
<input>0x30 0x53 0x74 0x30 0x30 0x2C 0x30 0x79 0x0A 0xD8 0x42 0xDF </input>
<input>0x30 0x53 0x74 0x30 0x30 0x2C 0x30 0x79 0x0A 0xD8 0x42 0xDF</input>
<encoding>UTF16LE</encoding>
<return>1</return>
<output>0xE5 0x8C 0xB0 0xE3 0x81 0xB4 0xE2 0xB0 0xB0 0xE7 0xA4 0xB0 0xEF 0xBF 0xBD </output>
<output>0xE5 0x8C 0xB0 0xE3 0x81 0xB4 0xE2 0xB0 0xB0 0xE7 0xA4 0xB0 0xF0 0x92 0xAD 0x82</output>
</text_encoding_UTF_to_UTF8>
<text_encoding_UTF_to_UTF8>
<input>0x00 0x00 0x00 0x41 0x00 0x00 0x03 0xA9 0x00 0x00 0x8A 0x9E 0x00 0x01 0x03 0x84</input>
<encoding>UTF32BE</encoding>
<return>1</return>
<output>0x41 0xCE 0xA9 0xE8 0xAA 0x9E 0xF0 0x90 0x8E 0x84</output>
</text_encoding_UTF_to_UTF8>
<text_encoding_UTF_to_UTF8>
<input>0x41 0x00 0x00 0x00 0xA9 0x03 0x00 0x00 0x9E 0x8A 0x00 0x00 0x84 0x03 0x01 0x00</input>
<encoding>UTF32LE</encoding>
<return>1</return>
<output>0x41 0xCE 0xA9 0xE8 0xAA 0x9E 0xF0 0x90 0x8E 0x84</output>
</text_encoding_UTF_to_UTF8>
<text_encoding_UTF_to_UTF8>
<input>0x00 0x41 0x03 0xA9 0x8A 0x9E 0xD8 0x00 0xDF 0x84</input>
<encoding>UTF16BE</encoding>
<return>1</return>
<output>0x41 0xCE 0xA9 0xE8 0xAA 0x9E 0xF0 0x90 0x8E 0x84</output>
</text_encoding_UTF_to_UTF8>
<text_encoding_UTF_to_UTF8>
<input>0x41 0x00 0xA9 0x03 0x9E 0x8A 0x00 0xD8 0x84 0xDF</input>
<encoding>UTF16LE</encoding>
<return>1</return>
<output>0x41 0xCE 0xA9 0xE8 0xAA 0x9E 0xF0 0x90 0x8E 0x84</output>
</text_encoding_UTF_to_UTF8>
<text_encoding_UTF_to_UTF8>
<input>0x41 0xCE 0xA9 0xE8 0xAA 0x9E 0xF0 0x90 0x8E 0x84</input>
<encoding>UTF8</encoding>
<return>1</return>
<output>0x41 0xCE 0xA9 0xE8 0xAA 0x9E 0xF0 0x90 0x8E 0x84</output>
</text_encoding_UTF_to_UTF8>
</TestTextEncoding>
<TestVersion>
Expand Down
107 changes: 105 additions & 2 deletions tests_text_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ TEST_F(TestTextEncoding, text_encoding_UTF16LE_from_code_page)
buffer_release(output);
}

TEST(TestTextEncoding_, text_encoding_encode_decode_UTF8)
TEST(TestTextEncoding_, DISABLED_text_encoding_encode_decode_UTF8)
{
uint8_t output[6];

Expand All @@ -371,7 +371,110 @@ TEST(TestTextEncoding_, text_encoding_encode_decode_UTF8)
}
}

TEST(TestTextEncoding_, text_encoding_encode_UTF8)
// Data-driven test of text_encoding_encode_UTF8_single.
//
// For every fixture node:
//  * the code point is taken from the optional <input> element - either a
//    "0x"-prefixed hexadecimal number or a decimal number, 0 when absent;
//  * the value returned by the encoder is compared with <return>;
//  * the bytes written by the encoder are compared with <output>.
TEST_F(TestTextEncoding, text_encoding_encode_UTF8_single)
{
    uint8_t output[4];
    std::vector<uint8_t> expected_output;

    for (const auto& node : nodes)
    {
        const auto the_node = node.node();
        //
        const std::string input_hex(the_node.select_node("input").node().child_value());
        const char* hex_start = std::strstr(input_hex.c_str(), "0x");
        uint32_t input = 0;

        if (nullptr != hex_start)
        {
            // Base 16 lets strtoul consume the "0x" prefix itself.
            input = static_cast<uint32_t>(std::strtoul(hex_start, nullptr, 16));
        }
        else if (!input_hex.empty())
        {
            const auto decimal = string_to_range(input_hex);
            input = static_cast<uint32_t>(uint64_parse(decimal.start, decimal.finish));
        }

        //
        const std::string return_str(the_node.select_node("return").node().child_value());
        const auto return_in_a_range = string_to_range(return_str);
        const auto expected_return =
            static_cast<uint8_t>(
                int_parse(return_in_a_range.start, return_in_a_range.finish));
        //
        const auto output_hex(get_data_from_nodes(the_node, "output"));
        expected_output.clear();
        string_hex_parse(output_hex.c_str(), expected_output);
        // The comparison loop below must never read past the local buffer,
        // even if the fixture itself is wrong.
        ASSERT_LE(expected_output.size(), sizeof(output)) << input_hex;

        // Pre-fill with 0xFF - a byte that never occurs in UTF-8 - so that
        // bytes left over from the previous case (or never written at all)
        // can not be mistaken for a correct result.
        for (auto& byte : output)
        {
            byte = 0xFF;
        }

        //
        const auto returned = text_encoding_encode_UTF8_single(input, output);
        //
        ASSERT_EQ(expected_return, returned) << input_hex;
        ASSERT_EQ(returned, expected_output.size()) << input_hex;

        // size_t index: streamed into the message as a number, not as a character.
        for (size_t i = 0, count = expected_output.size(); i < count; ++i)
        {
            ASSERT_EQ(expected_output[i], output[i])
                    << input_hex << '\n' << i << '\n';
        }

        --node_count;
    }
}

// Data-driven test of text_encoding_decode_UTF8_single.
//
// For every fixture node:
//  * the bytes to decode are taken from <input> (hexadecimal byte list);
//  * the value returned by the decoder is compared with <return>;
//  * the decoded code point is compared with the optional <output> element -
//    either a "0x"-prefixed hexadecimal number or a decimal number,
//    0 when absent.
TEST_F(TestTextEncoding, text_encoding_decode_UTF8_single)
{
    // Larger than any value the fixtures expect; used to detect a decoder
    // that reports success without storing a result.
    const uint32_t not_decoded = 0xFFFFFFFFu;
    std::vector<uint8_t> input;

    for (const auto& node : nodes)
    {
        const auto the_node = node.node();
        //
        const auto input_hex(get_data_from_nodes(the_node, "input"));
        input.clear();
        string_hex_parse(input_hex.c_str(), input);
        //
        const std::string return_str(the_node.select_node("return").node().child_value());
        range in_a_range = string_to_range(return_str);
        const auto expected_return =
            static_cast<uint8_t>(
                int_parse(in_a_range.start, in_a_range.finish));
        //
        const std::string output_hex(the_node.select_node("output").node().child_value());
        const char* hex_start = std::strstr(output_hex.c_str(), "0x");
        uint32_t expected_output = 0;

        if (nullptr != hex_start)
        {
            // Base 16 lets strtoul consume the "0x" prefix itself.
            expected_output = static_cast<uint32_t>(std::strtoul(hex_start, nullptr, 16));
        }
        else if (!output_hex.empty())
        {
            in_a_range = string_to_range(output_hex);
            expected_output = static_cast<uint32_t>(uint64_parse(in_a_range.start, in_a_range.finish));
        }

        in_a_range.start = input.data();
        in_a_range.finish = input.data() + input.size();
        // Fresh value for every case: a result left over from the previous
        // case must not be able to satisfy the assertion below.
        uint32_t output = not_decoded;
        //
        const auto returned = text_encoding_decode_UTF8_single(in_a_range.start, in_a_range.finish, &output);
        //
        ASSERT_EQ(expected_return, returned) << input_hex;
        ASSERT_EQ(expected_output, output) << input_hex;
        //
        --node_count;
    }
}

TEST(TestTextEncoding_, DISABLED_text_encoding_encode_UTF8)
{
std::string input_buffer(buffer_size_of(), 0);
auto input = reinterpret_cast<void*>(&input_buffer[0]);
Expand Down
Loading

0 comments on commit 9211013

Please sign in to comment.