Define a IsValidUtf8 equivalent with unit tests (#30386)

* Add a completely untested (but compilable) utf8 tester * Start adding unit tests * More test cases * tests * More tests * Fix typo * Slightly better formatting on test * Add comment about embedded zeroes * Restyle * Allow embedded zeroes (even if it pains me to do so) * Update copyright year and added 2 more tests * Restyle * Some comments and added another set of tests * Use fromCharString * Typo fixes * Fix dates for copyrights * Restyle
project-chip · Nov 9, 2023 · b0f5983 · b0f5983
1 parent 491354a
commit b0f5983
Show file tree

Hide file tree

Showing 5 changed files with 414 additions and 0 deletions.
diff --git a/src/lib/support/BUILD.gn b/src/lib/support/BUILD.gn
@@ -214,6 +214,8 @@ static_library("support") {
  "logging/BinaryLogging.cpp",
  "logging/BinaryLogging.h",
  "logging/CHIPLogging.h",
+ "utf8.cpp",
+ "utf8.h",
  "verhoeff/Verhoeff.cpp",
  "verhoeff/Verhoeff.h",
  "verhoeff/Verhoeff10.cpp",

diff --git a/src/lib/support/tests/BUILD.gn b/src/lib/support/tests/BUILD.gn
@@ -55,6 +55,7 @@ chip_test_suite_using_nltest("tests") {
  "TestTimeUtils.cpp",
  "TestTlvJson.cpp",
  "TestTlvToJson.cpp",
+ "TestUtf8.cpp",
  "TestVariant.cpp",
  "TestZclString.cpp",
  ]

diff --git a/src/lib/support/tests/TestUtf8.cpp b/src/lib/support/tests/TestUtf8.cpp
@@ -0,0 +1,170 @@
+
+/*
+ *
+ * Copyright (c) 2023 Project CHIP Authors
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <functional>
+
+#include <lib/support/UnitTestRegistration.h>
+#include <lib/support/utf8.h>
+
+#include <nlunit-test.h>
+
+namespace {
+
+using namespace chip;
+
+void TestValidStrings(nlTestSuite * inSuite, void * inContext)
+{
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan())); // empty span ok
+
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("abc")));
+
+ // Various tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
+
+ // Generic UTF8
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("κόσμε")));
+
+ // First possible sequence of a certain length
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("ࠀ")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("𐀀")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������")));
+
+ // Last possible sequence of a certain length
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("߿")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������")));
+
+ // Other boundary conditions
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("퟿")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("􏿿")));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����")));
+
+ // NOTE: UTF8 allows embedded NULs,
+ // even though strings like that are probably not ideal to handle.
+ // Test that we allow them, but consider disallowing them
+ // completely later if the spec is updated to forbid them.
+ {
+ char zero[16] = { 0 };
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 0)));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 1)));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 2)));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 3)));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 4)));
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 16)));
+ }
+
+ {
+ char insideZero[] = "test\0zero";
+ NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(insideZero)));
+ }
+}
+
+// Asserts that the given list of byte values is rejected by Utf8::IsValid.
+//
+// The arguments are stored in a local uint8_t array and wrapped in a CharSpan
+// whose length is sizeof() of that array, so the length is explicit and does
+// not depend on any terminating zero byte.
+//
+// Expects a variable named `inSuite` to be in scope at the point of use.
+// The trailing `(void) 0` makes the invocation require a terminating semicolon.
+#define TEST_INVALID_BYTES(...) \
+ { \
+ uint8_t _buff[] = { __VA_ARGS__ }; \
+ CharSpan _span(reinterpret_cast<const char *>(_buff), sizeof(_buff)); \
+ NL_TEST_ASSERT(inSuite, !Utf8::IsValid(_span)); \
+ } \
+ (void) 0
+
+// Byte sequences that Utf8::IsValid is expected to reject.
+// `inContext` is not used by this test.
+void TestInvalidStrings(nlTestSuite * inSuite, void * inContext)
+{
+ // Second byte outside the restricted range allowed for its lead byte.
+ // The letters refer to the (A)..(D) rows of the table in utf8.cpp:
+ // A: 0x9F is below 0xA0, B: 0xB0 is above 0x9F, C: 0x8F is below 0x90
+ TEST_INVALID_BYTES(0xe0, 0b1001'1111, 0x80); // A
+ TEST_INVALID_BYTES(0xed, 0b1011'0000, 0x80); // B
+ TEST_INVALID_BYTES(0xf0, 0b1000'1111, 0x80); // C
+
+ // Beyond the encodable range: second byte after 0xF4 above 0x8F (row D),
+ // or a lead byte above 0xF4
+ TEST_INVALID_BYTES(0xf4, 0x90, 0x80, 0x80); // D
+ TEST_INVALID_BYTES(0xf4, 0x91, 0x82, 0x83);
+ TEST_INVALID_BYTES(0xf5, 0x81, 0x82, 0x83);
+
+ // Missing continuation: the input ends while more bytes are still expected.
+ // NOTE: the sequences starting with 0xF4, 0x9F are already rejected at the
+ // second byte (0x9F is above 0x8F), so they do not exercise truncation itself.
+ TEST_INVALID_BYTES(0xC2);
+ TEST_INVALID_BYTES(0xE0);
+ TEST_INVALID_BYTES(0xE1);
+ TEST_INVALID_BYTES(0xE1, 0x9F);
+ TEST_INVALID_BYTES(0xED, 0x9F);
+ TEST_INVALID_BYTES(0xEE, 0x9F);
+ TEST_INVALID_BYTES(0xF0);
+ TEST_INVALID_BYTES(0xF0, 0x9F);
+ TEST_INVALID_BYTES(0xF0, 0x9F, 0x9F);
+ TEST_INVALID_BYTES(0xF1);
+ TEST_INVALID_BYTES(0xF1, 0x9F);
+ TEST_INVALID_BYTES(0xF1, 0x9F, 0x9F);
+ TEST_INVALID_BYTES(0xF4);
+ TEST_INVALID_BYTES(0xF4, 0x9F);
+ TEST_INVALID_BYTES(0xF4, 0x9F, 0x9F);
+
+ // More tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
+ // A continuation byte is not accepted where a lead byte is expected
+ TEST_INVALID_BYTES(0x80); // First continuation byte
+ TEST_INVALID_BYTES(0xBF); // Last continuation byte
+
+ // Impossible bytes
+ TEST_INVALID_BYTES(0xFE);
+ TEST_INVALID_BYTES(0xFF);
+ TEST_INVALID_BYTES(0xFE, 0xFE, 0xFF, 0xFF);
+
+ // Overlong sequences
+ // The 0xC0/0xC1 and 0xF8/0xFC lead bytes are not accepted as a first byte at all;
+ // the 0xE0 and 0xF0 cases are rejected by the restricted second-byte ranges.
+ // 4.1 Examples of an overlong ASCII character (in w3c tests)
+ TEST_INVALID_BYTES(0xc0, 0xaf);
+ TEST_INVALID_BYTES(0xe0, 0x80, 0xaf);
+ TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0xaf);
+ TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0xaf);
+ TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf);
+ // 4.2 Maximum overlong sequences (in w3c tests)
+ TEST_INVALID_BYTES(0xc1, 0xbf);
+ TEST_INVALID_BYTES(0xe0, 0x9f, 0xbf);
+ TEST_INVALID_BYTES(0xf0, 0x8f, 0xbf, 0xbf);
+ TEST_INVALID_BYTES(0xf8, 0x87, 0xbf, 0xbf, 0xbf);
+ TEST_INVALID_BYTES(0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf);
+ // 4.3 Overlong representation of the NUL character (in w3c tests)
+ TEST_INVALID_BYTES(0xc0, 0x80);
+ TEST_INVALID_BYTES(0xe0, 0x80, 0x80);
+ TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0x80);
+ TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0x80);
+ TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0x80);
+}
+
+} // namespace
+
+// clang-format off
+// Table of the test functions in this file, terminated by NL_TEST_SENTINEL().
+// Passed to the suite created in TestUtf8().
+const nlTest sTests[] =
+{
+ NL_TEST_DEF("TestValidStrings", TestValidStrings),
+ NL_TEST_DEF("TestInvalidStrings", TestInvalidStrings),
+ NL_TEST_SENTINEL()
+};
+// clang-format on
+
+// Entry point of the suite: runs every test listed in sTests and returns
+// whatever nlTestRunnerStats reports for the suite.
+int TestUtf8()
+{
+ // No suite-level setup/teardown callbacks (both nullptr) and no test context.
+ nlTestSuite theSuite = { "CHIP UTF8 tests", &sTests[0], nullptr, nullptr };
+ nlTestRunner(&theSuite, nullptr);
+ return nlTestRunnerStats(&theSuite);
+}
+
+// NOTE(review): presumably makes TestUtf8 discoverable by the common unit test
+// runner; the macro is defined outside this file (UnitTestRegistration.h) - confirm.
+CHIP_REGISTER_TEST_SUITE(TestUtf8);
diff --git a/src/lib/support/utf8.cpp b/src/lib/support/utf8.cpp
@@ -0,0 +1,194 @@
+/*
+ *
+ * Copyright (c) 2023 Project CHIP Authors
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "utf8.h"
+
+namespace chip {
+namespace Utf8 {
+
+namespace {
+/**
+ State machine for UTF8 valid bytes
+
+Table 3-7. Well-Formed UTF-8 Byte Sequences
+
+Code Points | First B | Second B | Third B | Fourth B
+------------------+----------+------------+---------+---------
+U+0000..U+007F | 00..7F | | |
+U+0080..U+07FF | C2..DF | 80..BF | |
+U+0800..U+0FFF | E0 | A0..BF (A) | 80..BF |
+U+1000..U+CFFF | E1..EC | 80..BF | 80..BF |
+U+D000..U+D7FF | ED | 80..9F (B) | 80..BF |
+U+E000..U+FFFF | EE..EF | 80..BF | 80..BF |
+U+10000..U+3FFFF | F0 | 90..BF (C) | 80..BF | 80..BF
+U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF
+U+100000..U+10FFFF| F4 | 80..8F (D) | 80..BF | 80..BF
+*/
+
+/**
+ * States of the UTF-8 validation state machine driven by NextState.
+ *
+ * The kSecondByte_X states correspond to the rows marked (A)..(D) in the
+ * table above, where the second byte has a narrower range than 0x80 .. 0xBF.
+ */
+enum class ParserState
+{
+ kFirstByte, // expecting the first byte of a sequence; the only state IsValid accepts at end of input
+ kSecondByte_A, // after 0xE0: second byte must be 0xA0 .. 0xBF, then one more byte
+ kSecondByte_B, // after 0xED: second byte must be 0x80 .. 0x9F, then one more byte
+ kSecondByte_C, // after 0xF0: second byte must be 0x90 .. 0xBF, then two more bytes
+ kSecondByte_D, // after 0xF4: second byte must be 0x80 .. 0x8F, then two more bytes
+ kExtraOneByte, // 0x80 .. 0xBF once
+ kExtraTwoBytes, // 0x80 .. 0xBF twice
+ kExtraThreeBytes, // 0x80 .. 0xBF three times
+ //
+ kInvalid, // some error
+};
+
+/**
+ * Advances the UTF-8 validation state machine by one input byte.
+ *
+ * @param state the state reached after the previously consumed bytes
+ * @param value the next input byte
+ * @return the state after consuming `value`. ParserState::kInvalid is returned
+ *         when `value` is not allowed in `state`, and also for any `state`
+ *         that has no case below (including kInvalid itself).
+ */
+ParserState NextState(ParserState state, uint8_t value)
+{
+ switch (state)
+ {
+ case ParserState::kFirstByte:
+ // The first byte decides which bytes must follow.
+ // Values 0x80 .. 0xC1 and 0xF5 .. 0xFF match none of the ranges below
+ // and end up in the final else (invalid).
+ if (value <= 0x7F)
+ {
+ // Single-byte sequence (this includes 0x00): ready for the next sequence
+ return ParserState::kFirstByte;
+ }
+ else if ((value >= 0xC2) && (value <= 0xDF))
+ {
+ return ParserState::kExtraOneByte;
+ }
+ else if (value == 0xE0)
+ {
+ return ParserState::kSecondByte_A;
+ }
+ else if ((value >= 0xE1) && (value <= 0xEC))
+ {
+ return ParserState::kExtraTwoBytes;
+ }
+ else if (value == 0xED)
+ {
+ return ParserState::kSecondByte_B;
+ }
+ else if ((value >= 0xEE) && (value <= 0xEF))
+ {
+ return ParserState::kExtraTwoBytes;
+ }
+ else if (value == 0xF0)
+ {
+ return ParserState::kSecondByte_C;
+ }
+ else if ((value >= 0xF1) && (value <= 0xF3))
+ {
+ return ParserState::kExtraThreeBytes;
+ }
+ else if (value == 0xF4)
+ {
+ return ParserState::kSecondByte_D;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kSecondByte_A:
+ // Second byte after 0xE0 (row A): one plain continuation byte follows
+ if (value >= 0xA0 && value <= 0xBF)
+ {
+ return ParserState::kExtraOneByte;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kSecondByte_B:
+ // Second byte after 0xED (row B): one plain continuation byte follows
+ if (value >= 0x80 && value <= 0x9F)
+ {
+ return ParserState::kExtraOneByte;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kSecondByte_C:
+ // Second byte after 0xF0 (row C): two plain continuation bytes follow
+ if (value >= 0x90 && value <= 0xBF)
+ {
+ return ParserState::kExtraTwoBytes;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kSecondByte_D:
+ // Second byte after 0xF4 (row D): two plain continuation bytes follow
+ if (value >= 0x80 && value <= 0x8F)
+ {
+ return ParserState::kExtraTwoBytes;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kExtraOneByte:
+ // Last continuation byte of a sequence: back to expecting a first byte
+ if (value >= 0x80 && value <= 0xBF)
+ {
+ return ParserState::kFirstByte;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kExtraTwoBytes:
+ // One continuation byte consumed here, one more still expected
+ if (value >= 0x80 && value <= 0xBF)
+ {
+ return ParserState::kExtraOneByte;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ case ParserState::kExtraThreeBytes:
+ // One continuation byte consumed here, two more still expected
+ if (value >= 0x80 && value <= 0xBF)
+ {
+ return ParserState::kExtraTwoBytes;
+ }
+ else
+ {
+ return ParserState::kInvalid;
+ }
+ default:
+ // kInvalid (or any unexpected value) never recovers
+ return ParserState::kInvalid;
+ }
+}
+
+} // namespace
+
+/**
+ * Checks whether the bytes of `span` are accepted by the UTF-8 state machine
+ * implemented in NextState.
+ *
+ * An empty span is reported as valid (no byte is read in that case). Zero bytes
+ * are treated like any other single-byte value, so embedded zeros are accepted.
+ *
+ * @param span the bytes to validate; exactly span.size() bytes are examined
+ * @return true if every byte was accepted and the input does not end in the
+ *         middle of a multi-byte sequence, false otherwise
+ */
+bool IsValid(CharSpan span)
+{
+ ParserState state = ParserState::kFirstByte;
+
+ const char * data = span.data();
+ const size_t kLength = span.size();
+
+ // Every byte should be valid; stop at the first one that is rejected
+ for (size_t i = 0; i < kLength; i++)
+ {
+ // char may be signed: convert to uint8_t so the range checks see 0 .. 255
+ state = NextState(state, static_cast<uint8_t>(data[i]));
+
+ if (state == ParserState::kInvalid)
+ {
+ return false;
+ }
+ }
+
+ // finally no continuation should be expected
+ // (any other state means the input was cut off inside a sequence)
+ return state == ParserState::kFirstByte;
+}
+
+} // namespace Utf8
+} // namespace chip