[text_encoding] Added support for decoding 4-byte UTF-8 symbols.

The function text_encoding_decode_UTF8_single now supports decoding 4-byte UTF-8 symbols. [tests_text_encoding] Uncommented the test cases for the text_encoding_decode_UTF8_single function. Enabled the text_encoding_encode_decode_UTF8 and text_encoding_encode_UTF8 tests.
TheVice · Apr 12, 2024 · 9a9bfd5 · 9a9bfd5
1 parent 13646e7
commit 9a9bfd5
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 16 deletions.
diff --git a/tests.xml b/tests.xml
@@ -10989,7 +10989,7 @@ Approximate round trip times in milli-seconds:
       <return>3</return>
       <output>0xFFFF</output>
     </text_encoding_decode_UTF8_single>
-    <!--text_encoding_decode_UTF8_single>
+    <text_encoding_decode_UTF8_single>
       <input>0xF0 0x90 0x80 0x80</input>
       <return>4</return>
       <output>0x10000</output>
@@ -11003,7 +11003,7 @@ Approximate round trip times in milli-seconds:
       <input>0xF4 0x8F 0xBF 0xBF</input>
       <return>4</return>
       <output>0x10FFFF</output>
-    </text_encoding_decode_UTF8_single-->
+    </text_encoding_decode_UTF8_single>
     <text_encoding_UTF_to_UTF8>
       <encoding>UTF7</encoding>
     </text_encoding_UTF_to_UTF8>

diff --git a/tests_text_encoding.cpp b/tests_text_encoding.cpp
@@ -344,7 +344,7 @@ TEST_F(TestTextEncoding, text_encoding_UTF16LE_from_code_page)
 	buffer_release(output);
 }
 
-TEST(TestTextEncoding_, DISABLED_text_encoding_encode_decode_UTF8)
+TEST(TestTextEncoding_, text_encoding_encode_decode_UTF8)
 {
 	uint8_t output[6];
 
@@ -358,15 +358,22 @@ TEST(TestTextEncoding_, DISABLED_text_encoding_encode_decode_UTF8)
 		{
 			const auto stored_o = text_encoding_decode_UTF8_single(
 									  output, output + stored, &o);
-			ASSERT_EQ(stored, stored_o);
-			ASSERT_EQ(i, o);
+			ASSERT_EQ(stored, stored_o) << i;
+			ASSERT_EQ(i, o) << i;
 		}
 		else
 		{
-			ASSERT_EQ(3, stored);
-			ASSERT_EQ(0xEF, output[0]);
-			ASSERT_EQ(0xBF, output[1]);
-			ASSERT_EQ(0xBD, output[2]);
+			if (0xEF < output[0])
+			{
+				ASSERT_EQ(4, stored) << i;
+			}
+			else
+			{
+				ASSERT_EQ(3, stored) << i;
+				ASSERT_EQ(0xEF, output[0]) << i;
+				ASSERT_EQ(0xBF, output[1]) << i;
+				ASSERT_EQ(0xBD, output[2]) << i;
+			}
 		}
 	}
 }
@@ -474,7 +481,7 @@ TEST_F(TestTextEncoding, text_encoding_decode_UTF8_single)
 	}
 }
 
-TEST(TestTextEncoding_, DISABLED_text_encoding_encode_UTF8)
+TEST(TestTextEncoding_, text_encoding_encode_UTF8)
 {
 	std::string input_buffer(buffer_size_of(), 0);
 	auto input = reinterpret_cast<void*>(&input_buffer[0]);

diff --git a/text_encoding.c b/text_encoding.c
@@ -772,6 +772,11 @@ uint8_t text_encoding_is_valid_octet_(uint8_t input)
 	return 0x9F < input && input < 0xC0;
 }
 
+uint8_t text_encoding_is_valid_octet(uint8_t input)
+{
+	return 0x7F < input && input < 0xC0;
+}
+
 uint8_t text_encoding_decode_UTF8_single(
 	const uint8_t* input_start, const uint8_t* input_finish, uint32_t* output)
 {
@@ -783,7 +788,82 @@ uint8_t text_encoding_decode_UTF8_single(
 		return 0;
 	}
 
+#ifdef TODO
+	uint8_t input[5];
+	input[0] = *input_start;
 	*output = UTF16LE_UNKNOWN_CHAR;
+
+	if (input[0] < 0x80)
+	{
+		*output = input[0];
+		return 1;
+	}
+	else if (input[0] < 0xC0 || 0xF7 < input[0])
+	{
+		return 1;
+	}
+
+	if (0xEF < input[0])
+	{
+		input[4] = 4;
+	}
+	else if (0xDF < input[0])
+	{
+		input[4] = 3;
+	}
+	else if (0xBF < input[0])
+	{
+		input[4] = 2;
+	}
+	else
+	{
+		return 1;
+	}
+
+	input[1] = (uint8_t)(input_finish - input_start);
+
+	if (input[1] < input[4])
+	{
+		return 1;
+	}
+
+	const uint8_t* start = input_start;
+	const uint8_t* finish = input_start + input[4];
+
+	while (++start < finish)
+	{
+		if (!text_encoding_is_valid_octet(*start))
+		{
+			return (uint8_t)(start - input_start);
+		}
+
+		input[start - input_start] = *start;
+	}
+
+	input[0] <<= input[4];
+	input[0] >>= input[4];
+	/*
+	--count;*/
+
+	while (start < input_finish)
+	{
+		if (!text_encoding_is_valid_octet_(*start))
+		{
+			break;
+		}
+
+		--count;
+	}
+
+	if (0 < count)
+	{
+		*output = UTF16LE_UNKNOWN_CHAR;
+		return start - input_start;
+	}
+
+	return 1;
+#else
+	* output = UTF16LE_UNKNOWN_CHAR;
 	const uint8_t octet_1 = *input_start;
 
 	if (octet_1 < 0x80)
@@ -796,14 +876,22 @@ uint8_t text_encoding_decode_UTF8_single(
 		return 1;
 	}
 
-	uint8_t octet_2 = 0;
-	uint8_t octet_3 = 0;
-	uint8_t* octets[2];
+	uint8_t octet_2, octet_3, octet_4, count = 0, max_count;
+	uint8_t* octets[3];
 	octets[0] = &octet_2;
 	octets[1] = &octet_3;
-	uint8_t count = 0;
+	octets[2] = &octet_4;
+
+	if (0xEF < octet_1)
+	{
+		max_count = 3;
+	}
+	else
+	{
+		max_count = 2;
+	}
 
-	while (++input_start < input_finish && count < 2)
+	while (++input_start < input_finish && count < max_count)
 	{
 		const uint8_t input_code = *input_start;
 
@@ -815,7 +903,21 @@ uint8_t text_encoding_decode_UTF8_single(
 		*(octets[count++]) = input_code;
 	}
 
-	if (2 == count && (0xDF < octet_1 && octet_1 < 0xF0))
+	if (3 == count && (0xEF < octet_1 && octet_1 < 0xF8))
+	{
+		if (!text_encoding_is_valid_octet(octet_2) ||
+			!text_encoding_is_valid_octet(octet_3) ||
+			!text_encoding_is_valid_octet(octet_4))
+		{
+			return count;
+		}
+
+		*output = 0x40000 * (octet_1 & 0x07);
+		*output += 0x1000 * (octet_2 & 0x3F);
+		*output += 0x40 * (octet_3 & 0x3F);
+		*output += octet_4 & 0x3F;
+	}
+	else if (2 == count && (0xDF < octet_1 && octet_1 < 0xF0))
 	{
 		if (octet_1 < 0xE1 && !text_encoding_is_valid_octet_(octet_2))
 		{
@@ -833,6 +935,7 @@ uint8_t text_encoding_decode_UTF8_single(
 	}
 
 	return 1 + count;
+#endif
 }
 
 uint8_t text_encoding_decode_UTF8(