Skip to content

Commit

Permalink
Define a IsValidUtf8 equivalent with unit tests (#30386)
Browse files Browse the repository at this point in the history
* Add a completely untested (but compilable) utf8 tester

* Start adding unit tests

* More test cases

* tests

* More tests

* Fix typo

* Slightly better formatting on test

* Add comment about embedded zeroes

* Restyle

* Allow embedded zeroes (even if it pains me to do so)

* Update copyright year and added 2 more tests

* Restyle

* Some comments and added another set of tests

* Use fromCharString

* Typo fixes

* Fix dates for copyrights

* Restyle
  • Loading branch information
andy31415 authored Nov 9, 2023
1 parent 491354a commit b0f5983
Show file tree
Hide file tree
Showing 5 changed files with 414 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/lib/support/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ static_library("support") {
"logging/BinaryLogging.cpp",
"logging/BinaryLogging.h",
"logging/CHIPLogging.h",
"utf8.cpp",
"utf8.h",
"verhoeff/Verhoeff.cpp",
"verhoeff/Verhoeff.h",
"verhoeff/Verhoeff10.cpp",
Expand Down
1 change: 1 addition & 0 deletions src/lib/support/tests/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ chip_test_suite_using_nltest("tests") {
"TestTimeUtils.cpp",
"TestTlvJson.cpp",
"TestTlvToJson.cpp",
"TestUtf8.cpp",
"TestVariant.cpp",
"TestZclString.cpp",
]
Expand Down
170 changes: 170 additions & 0 deletions src/lib/support/tests/TestUtf8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@

/*
*
* Copyright (c) 2023 Project CHIP Authors
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <functional>

#include <lib/support/UnitTestRegistration.h>
#include <lib/support/utf8.h>

#include <nlunit-test.h>

namespace {

using namespace chip;

void TestValidStrings(nlTestSuite * inSuite, void * inContext)
{
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan())); // empty span ok

NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("abc")));

// Various tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html

// Generic UTF8
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("κόσμε")));

// First possible sequence of a certain length
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("€")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("𐀀")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������")));

// Last possible sequence of a certain length
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("߿")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("￿")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("�����")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("������")));

// Other boundary conditions
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("􏿿")));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan::fromCharString("����")));

// NOTE: UTF-8 allows embedded NUL (zero) bytes,
// even though strings like that are probably not ideal for handling.
// Test that we allow them, but consider disallowing them
// completely later if the spec is updated as such.
{
char zero[16] = { 0 };
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 0)));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 1)));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 2)));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 3)));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 4)));
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(zero, 16)));
}

{
char insideZero[] = "test\0zero";
NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan(insideZero)));
}
}

// Asserts that Utf8::IsValid rejects the byte sequence passed as the macro
// arguments. A variable named `inSuite` must be in scope at the call site.
// The do/while(false) wrapper makes the expansion a single statement that
// still requires a terminating semicolon.
#define TEST_INVALID_BYTES(...)                                                     \
    do                                                                              \
    {                                                                               \
        uint8_t _bytes[] = { __VA_ARGS__ };                                         \
        CharSpan _span(reinterpret_cast<const char *>(_bytes), sizeof(_bytes));     \
        NL_TEST_ASSERT(inSuite, !Utf8::IsValid(_span));                             \
    } while (false)

// Checks that Utf8::IsValid rejects malformed input: second bytes outside the
// range allowed for their lead byte, truncated sequences, stray continuation
// bytes, bytes that can never start a sequence, and overlong encodings.
void TestInvalidStrings(nlTestSuite * inSuite, void * inContext)
{
// Second byte outside the range permitted for the lead byte. The A..D markers
// match the restricted second-byte ranges (A)..(D) of the "Well-Formed UTF-8
// Byte Sequences" table documented in utf8.cpp.
TEST_INVALID_BYTES(0xe0, 0b1001'1111, 0x80); // A
TEST_INVALID_BYTES(0xed, 0b1011'0000, 0x80); // B
TEST_INVALID_BYTES(0xf0, 0b1000'1111, 0x80); // C

// Outside the valid code point range (lead byte F4 with second byte above 8F,
// or lead byte above F4)
TEST_INVALID_BYTES(0xf4, 0x90, 0x80, 0x80); // D
TEST_INVALID_BYTES(0xf4, 0x91, 0x82, 0x83);
TEST_INVALID_BYTES(0xf5, 0x81, 0x82, 0x83);

// Missing continuation: the input ends before the sequence is complete
TEST_INVALID_BYTES(0xC2);
TEST_INVALID_BYTES(0xE0);
TEST_INVALID_BYTES(0xE1);
TEST_INVALID_BYTES(0xE1, 0x9F);
TEST_INVALID_BYTES(0xED, 0x9F);
TEST_INVALID_BYTES(0xEE, 0x9F);
TEST_INVALID_BYTES(0xF0);
TEST_INVALID_BYTES(0xF0, 0x9F);
TEST_INVALID_BYTES(0xF0, 0x9F, 0x9F);
TEST_INVALID_BYTES(0xF1);
TEST_INVALID_BYTES(0xF1, 0x9F);
TEST_INVALID_BYTES(0xF1, 0x9F, 0x9F);
TEST_INVALID_BYTES(0xF4);
TEST_INVALID_BYTES(0xF4, 0x9F);
TEST_INVALID_BYTES(0xF4, 0x9F, 0x9F);

// More tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
// A continuation byte on its own cannot start a sequence
TEST_INVALID_BYTES(0x80); // First continuation byte
TEST_INVALID_BYTES(0xBF); // Last continuation byte

// Impossible bytes
TEST_INVALID_BYTES(0xFE);
TEST_INVALID_BYTES(0xFF);
TEST_INVALID_BYTES(0xFE, 0xFE, 0xFF, 0xFF);

// Overlong sequences
// 4.1 Examples of an overlong ASCII character (in w3c tests)
TEST_INVALID_BYTES(0xc0, 0xaf);
TEST_INVALID_BYTES(0xe0, 0x80, 0xaf);
TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0xaf);
TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0xaf);
TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf);
// 4.2 Maximum overlong sequences (in w3c tests)
TEST_INVALID_BYTES(0xc1, 0xbf);
TEST_INVALID_BYTES(0xe0, 0x9f, 0xbf);
TEST_INVALID_BYTES(0xf0, 0x8f, 0xbf, 0xbf);
TEST_INVALID_BYTES(0xf8, 0x87, 0xbf, 0xbf, 0xbf);
TEST_INVALID_BYTES(0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf);
// 4.3 Overlong representation of the NUL character (in w3c tests)
TEST_INVALID_BYTES(0xc0, 0x80);
TEST_INVALID_BYTES(0xe0, 0x80, 0x80);
TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0x80);
TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0x80);
TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0x80);
}

} // namespace

// clang-format off
// Table of test cases handed to the nlTestSuite constructed in TestUtf8().
const nlTest sTests[] =
{
NL_TEST_DEF("TestValidStrings", TestValidStrings),
NL_TEST_DEF("TestInvalidStrings", TestInvalidStrings),
NL_TEST_SENTINEL()
};
// clang-format on

int TestUtf8()
{
nlTestSuite theSuite = { "CHIP UTF8 tests", &sTests[0], nullptr, nullptr };
nlTestRunner(&theSuite, nullptr);
return nlTestRunnerStats(&theSuite);
}

CHIP_REGISTER_TEST_SUITE(TestUtf8);
194 changes: 194 additions & 0 deletions src/lib/support/utf8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
/*
*
* Copyright (c) 2023 Project CHIP Authors
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utf8.h"

namespace chip {
namespace Utf8 {

namespace {

/**
   Byte-at-a-time validation is driven by the well-formed sequences below.
   The second-byte ranges marked (A)..(D) are narrower than the generic
   80..BF continuation range and therefore get dedicated parser states.

   Table 3-7. Well-Formed UTF-8 Byte Sequences
   Code Points       | First B  | Second B   | Third B | Fourth B
   ------------------+----------+------------+---------+---------
   U+0000..U+007F    | 00..7F   |            |         |
   U+0080..U+07FF    | C2..DF   | 80..BF     |         |
   U+0800..U+0FFF    | E0       | A0..BF (A) | 80..BF  |
   U+1000..U+CFFF    | E1..EC   | 80..BF     | 80..BF  |
   U+D000..U+D7FF    | ED       | 80..9F (B) | 80..BF  |
   U+E000..U+FFFF    | EE..EF   | 80..BF     | 80..BF  |
   U+10000..U+3FFFF  | F0       | 90..BF (C) | 80..BF  | 80..BF
   U+40000..U+FFFFF  | F1..F3   | 80..BF     | 80..BF  | 80..BF
   U+100000..U+10FFFF| F4       | 80..8F (D) | 80..BF  | 80..BF
*/
enum class ParserState
{
    kFirstByte,       // at a sequence boundary: next byte starts a new sequence
    kSecondByte_A,    // after E0: second byte must be A0..BF
    kSecondByte_B,    // after ED: second byte must be 80..9F
    kSecondByte_C,    // after F0: second byte must be 90..BF
    kSecondByte_D,    // after F4: second byte must be 80..8F
    kExtraOneByte,    // one more 80..BF byte required
    kExtraTwoBytes,   // two more 80..BF bytes required
    kExtraThreeBytes, // three more 80..BF bytes required
    kInvalid,         // malformed input was seen
};

// Yields `onMatch` when `value` lies in the inclusive range [low, high],
// and ParserState::kInvalid otherwise.
constexpr ParserState ExpectRange(uint8_t value, uint8_t low, uint8_t high, ParserState onMatch)
{
    return ((low <= value) && (value <= high)) ? onMatch : ParserState::kInvalid;
}

// Picks the state that follows the first byte of a sequence.
// The checks run in ascending byte order, so each guard only needs an upper
// bound (the special lead bytes E0/ED/F0/F4 are peeled off before their range).
ParserState StateAfterLeadByte(uint8_t lead)
{
    if (lead <= 0x7F)
    {
        return ParserState::kFirstByte; // single-byte sequence, already complete
    }
    if (lead < 0xC2)
    {
        return ParserState::kInvalid; // 80..C1 can never start a sequence
    }
    if (lead <= 0xDF)
    {
        return ParserState::kExtraOneByte;
    }
    if (lead == 0xE0)
    {
        return ParserState::kSecondByte_A;
    }
    if (lead == 0xED)
    {
        return ParserState::kSecondByte_B;
    }
    if (lead <= 0xEF)
    {
        return ParserState::kExtraTwoBytes; // E1..EC and EE..EF
    }
    if (lead == 0xF0)
    {
        return ParserState::kSecondByte_C;
    }
    if (lead <= 0xF3)
    {
        return ParserState::kExtraThreeBytes; // F1..F3
    }
    if (lead == 0xF4)
    {
        return ParserState::kSecondByte_D;
    }
    return ParserState::kInvalid; // F5..FF
}

// Advances the parser by one input byte.
//
// @param state  the state before consuming `value`
// @param value  the next byte of the input
// @return the state after consuming `value`; ParserState::kInvalid when the
//         byte is not permitted in `state` (kInvalid itself is never left).
ParserState NextState(ParserState state, uint8_t value)
{
    switch (state)
    {
    case ParserState::kFirstByte:
        return StateAfterLeadByte(value);
    case ParserState::kSecondByte_A:
        return ExpectRange(value, 0xA0, 0xBF, ParserState::kExtraOneByte);
    case ParserState::kSecondByte_B:
        return ExpectRange(value, 0x80, 0x9F, ParserState::kExtraOneByte);
    case ParserState::kSecondByte_C:
        return ExpectRange(value, 0x90, 0xBF, ParserState::kExtraTwoBytes);
    case ParserState::kSecondByte_D:
        return ExpectRange(value, 0x80, 0x8F, ParserState::kExtraTwoBytes);
    case ParserState::kExtraOneByte:
        return ExpectRange(value, 0x80, 0xBF, ParserState::kFirstByte);
    case ParserState::kExtraTwoBytes:
        return ExpectRange(value, 0x80, 0xBF, ParserState::kExtraOneByte);
    case ParserState::kExtraThreeBytes:
        return ExpectRange(value, 0x80, 0xBF, ParserState::kExtraTwoBytes);
    case ParserState::kInvalid:
        break;
    }

    // kInvalid, or a value outside the enumeration
    return ParserState::kInvalid;
}

} // namespace

// Reports whether the bytes in `span` form well-formed UTF-8.
//
// Each byte is fed through NextState, starting at a sequence boundary.
// The input is accepted only if no byte is rejected and the last byte leaves
// the parser back at a sequence boundary (i.e. no continuation byte is still
// pending). An empty span is accepted.
bool IsValid(CharSpan span)
{
    const char * const bytes = span.data();
    const size_t byteCount   = span.size();

    ParserState parser = ParserState::kFirstByte;
    size_t offset      = 0;

    // Stop at the first rejected byte; kInvalid is never left once entered.
    while ((offset < byteCount) && (parser != ParserState::kInvalid))
    {
        parser = NextState(parser, static_cast<uint8_t>(bytes[offset]));
        ++offset;
    }

    // Anything other than kFirstByte here is either an error or a truncated sequence.
    return parser == ParserState::kFirstByte;
}

} // namespace Utf8
} // namespace chip
Loading

0 comments on commit b0f5983

Please sign in to comment.