Skip to content

Commit

Permalink
[text_encoding] added support for decoding 4-byte UTF-8 symbols.
Browse files Browse the repository at this point in the history
Function text_encoding_decode_UTF8_single now supports decoding 4-byte
UTF-8 symbols.

[tests_text_encoding] uncommented test cases for the
text_encoding_decode_UTF8_single function.

Enabled text_encoding_encode_decode_UTF8 and text_encoding_encode_UTF8
tests.
  • Loading branch information
TheVice committed Apr 12, 2024
1 parent 13646e7 commit 9a9bfd5
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 16 deletions.
4 changes: 2 additions & 2 deletions tests.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10989,7 +10989,7 @@ Approximate round trip times in milli-seconds:
<return>3</return>
<output>0xFFFF</output>
</text_encoding_decode_UTF8_single>
<!--text_encoding_decode_UTF8_single>
<text_encoding_decode_UTF8_single>
<input>0xF0 0x90 0x80 0x80</input>
<return>4</return>
<output>0x10000</output>
Expand All @@ -11003,7 +11003,7 @@ Approximate round trip times in milli-seconds:
<input>0xF4 0x8F 0xBF 0xBF</input>
<return>4</return>
<output>0x10FFFF</output>
</text_encoding_decode_UTF8_single-->
</text_encoding_decode_UTF8_single>
<text_encoding_UTF_to_UTF8>
<encoding>UTF7</encoding>
</text_encoding_UTF_to_UTF8>
Expand Down
23 changes: 15 additions & 8 deletions tests_text_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ TEST_F(TestTextEncoding, text_encoding_UTF16LE_from_code_page)
buffer_release(output);
}

TEST(TestTextEncoding_, DISABLED_text_encoding_encode_decode_UTF8)
TEST(TestTextEncoding_, text_encoding_encode_decode_UTF8)
{
uint8_t output[6];

Expand All @@ -358,15 +358,22 @@ TEST(TestTextEncoding_, DISABLED_text_encoding_encode_decode_UTF8)
{
const auto stored_o = text_encoding_decode_UTF8_single(
output, output + stored, &o);
ASSERT_EQ(stored, stored_o);
ASSERT_EQ(i, o);
ASSERT_EQ(stored, stored_o) << i;
ASSERT_EQ(i, o) << i;
}
else
{
ASSERT_EQ(3, stored);
ASSERT_EQ(0xEF, output[0]);
ASSERT_EQ(0xBF, output[1]);
ASSERT_EQ(0xBD, output[2]);
if (0xEF < output[0])
{
ASSERT_EQ(4, stored) << i;
}
else
{
ASSERT_EQ(3, stored) << i;
ASSERT_EQ(0xEF, output[0]) << i;
ASSERT_EQ(0xBF, output[1]) << i;
ASSERT_EQ(0xBD, output[2]) << i;
}
}
}
}
Expand Down Expand Up @@ -474,7 +481,7 @@ TEST_F(TestTextEncoding, text_encoding_decode_UTF8_single)
}
}

TEST(TestTextEncoding_, DISABLED_text_encoding_encode_UTF8)
TEST(TestTextEncoding_, text_encoding_encode_UTF8)
{
std::string input_buffer(buffer_size_of(), 0);
auto input = reinterpret_cast<void*>(&input_buffer[0]);
Expand Down
115 changes: 109 additions & 6 deletions text_encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,11 @@ uint8_t text_encoding_is_valid_octet_(uint8_t input)
return 0x9F < input && input < 0xC0;
}

/* Reports whether 'input' lies in the inclusive range 0x80..0xBF,
 * i.e. its two most significant bits are '10'.
 * Returns 1 when it does, 0 otherwise. */
uint8_t text_encoding_is_valid_octet(uint8_t input)
{
	/* Masking with 0xC0 keeps only the top two bits; 0x80 means '10xxxxxx'. */
	return (input & 0xC0) == 0x80;
}

uint8_t text_encoding_decode_UTF8_single(
const uint8_t* input_start, const uint8_t* input_finish, uint32_t* output)
{
Expand All @@ -783,7 +788,82 @@ uint8_t text_encoding_decode_UTF8_single(
return 0;
}

#ifdef TODO
uint8_t input[5];
input[0] = *input_start;
*output = UTF16LE_UNKNOWN_CHAR;

if (input[0] < 0x80)
{
*output = input[0];
return 1;
}
else if (input[0] < 0xC0 || 0xF7 < input[0])
{
return 1;
}

if (0xEF < input[0])
{
input[4] = 4;
}
else if (0xDF < input[0])
{
input[4] = 3;
}
else if (0xBF < input[0])
{
input[4] = 2;
}
else
{
return 1;
}

input[1] = (uint8_t)(input_finish - input_start);

if (input[1] < input[4])
{
return 1;
}

const uint8_t* start = input_start;
const uint8_t* finish = input_start + input[4];

while (++start < finish)
{
if (!text_encoding_is_valid_octet(*start))
{
return (uint8_t)(start - input_start);
}

input[start - input_start] = *start;
}

input[0] <<= input[4];
input[0] >>= input[4];
/*
--count;*/

while (start < input_finish)
{
if (!text_encoding_is_valid_octet_(*start))
{
break;
}

--count;
}

if (0 < count)
{
*output = UTF16LE_UNKNOWN_CHAR;
return start - input_start;
}

return 1;
#else
* output = UTF16LE_UNKNOWN_CHAR;
const uint8_t octet_1 = *input_start;

if (octet_1 < 0x80)
Expand All @@ -796,14 +876,22 @@ uint8_t text_encoding_decode_UTF8_single(
return 1;
}

uint8_t octet_2 = 0;
uint8_t octet_3 = 0;
uint8_t* octets[2];
uint8_t octet_2, octet_3, octet_4, count = 0, max_count;
uint8_t* octets[3];
octets[0] = &octet_2;
octets[1] = &octet_3;
uint8_t count = 0;
octets[2] = &octet_4;

if (0xEF < octet_1)
{
max_count = 3;
}
else
{
max_count = 2;
}

while (++input_start < input_finish && count < 2)
while (++input_start < input_finish && count < max_count)
{
const uint8_t input_code = *input_start;

Expand All @@ -815,7 +903,21 @@ uint8_t text_encoding_decode_UTF8_single(
*(octets[count++]) = input_code;
}

if (2 == count && (0xDF < octet_1 && octet_1 < 0xF0))
if (3 == count && (0xEF < octet_1 && octet_1 < 0xF8))
{
if (!text_encoding_is_valid_octet(octet_2) ||
!text_encoding_is_valid_octet(octet_3) ||
!text_encoding_is_valid_octet(octet_4))
{
return count;
}

*output = 0x40000 * (octet_1 & 0x07);
*output += 0x1000 * (octet_2 & 0x3F);
*output += 0x40 * (octet_3 & 0x3F);
*output += octet_4 & 0x3F;
}
else if (2 == count && (0xDF < octet_1 && octet_1 < 0xF0))
{
if (octet_1 < 0xE1 && !text_encoding_is_valid_octet_(octet_2))
{
Expand All @@ -833,6 +935,7 @@ uint8_t text_encoding_decode_UTF8_single(
}

return 1 + count;
#endif
}

uint8_t text_encoding_decode_UTF8(
Expand Down

0 comments on commit 9a9bfd5

Please sign in to comment.