Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Define an IsValidUtf8 equivalent with unit tests #30386

Merged
merged 17 commits into from
Nov 9, 2023
2 changes: 2 additions & 0 deletions src/lib/support/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ static_library("support") {
"logging/BinaryLogging.cpp",
"logging/BinaryLogging.h",
"logging/CHIPLogging.h",
"utf8.cpp",
"utf8.h",
"verhoeff/Verhoeff.cpp",
"verhoeff/Verhoeff.h",
"verhoeff/Verhoeff10.cpp",
Expand Down
1 change: 1 addition & 0 deletions src/lib/support/tests/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ chip_test_suite_using_nltest("tests") {
"TestTimeUtils.cpp",
"TestTlvJson.cpp",
"TestTlvToJson.cpp",
"TestUtf8.cpp",
"TestVariant.cpp",
"TestZclString.cpp",
]
Expand Down
151 changes: 151 additions & 0 deletions src/lib/support/tests/TestUtf8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@

/*
*
* Copyright (c) 2021 Project CHIP Authors
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <functional>

#include <lib/support/UnitTestRegistration.h>
#include <lib/support/utf8.h>

#include <nlunit-test.h>

namespace {

using namespace chip;

/// Test helper: checks a NUL-terminated C string with Utf8::IsValid.
///
/// The terminating NUL is not part of the validated span, because the span
/// length is taken from strlen().
///
/// @param s  NUL-terminated string to validate; must not be null, since it is
///           passed to strlen().
/// @return   the result of Utf8::IsValid for the string's bytes.
bool IsValidCStringAsUtf8(const char * s)
{
    return Utf8::IsValid(CharSpan(s, strlen(s)));
}

/// Checks that well-formed UTF-8 inputs are accepted by Utf8::IsValid.
///
/// All multi-byte test vectors are written as explicit byte escapes so that the
/// test does not depend on the encoding of this source file or on how editors
/// and review tools render unusual code points.
void TestValidStrings(nlTestSuite * inSuite, void * inContext)
{
    NL_TEST_ASSERT(inSuite, Utf8::IsValid(CharSpan())); // empty span ok

    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8(""));
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("abc"));

    // Various tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html

    // Generic UTF8: Greek word "kosme" (U+03BA U+03CC U+03C3 U+03BC U+03B5)
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5"));

    // First possible sequence of a certain length.
    // (5- and 6-byte forms are not well-formed UTF-8 and are therefore not listed here.)
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xC2\x80"));         // U+0080, 2 bytes
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xE0\xA0\x80"));     // U+0800, 3 bytes
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xF0\x90\x80\x80")); // U+10000, 4 bytes

    // Last possible sequence of a certain length
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\x7F"));             // U+007F, 1 byte
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xDF\xBF"));         // U+07FF, 2 bytes
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xEF\xBF\xBF"));     // U+FFFF, 3 bytes
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xF4\x8F\xBF\xBF")); // U+10FFFF, 4 bytes

    // Other boundary conditions
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xED\x9F\xBF")); // U+D7FF, last before the surrogates
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xEE\x80\x80")); // U+E000, first after the surrogates
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xEF\xBF\xBD")); // U+FFFD, replacement character

    // Several replacement characters in a row
    NL_TEST_ASSERT(inSuite, IsValidCStringAsUtf8("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD"));
}

// Asserts that the byte sequence given as arguments is rejected by Utf8::IsValid.
//
// Expects a variable named `inSuite` to be in scope at the point of use. The
// trailing `(void) 0` makes the invocation require a terminating semicolon.
#define TEST_INVALID_BYTES(...) \
    { \
        uint8_t _bytes[] = { __VA_ARGS__ }; \
        CharSpan _span(reinterpret_cast<const char *>(_bytes), sizeof(_bytes)); \
        NL_TEST_ASSERT(inSuite, !Utf8::IsValid(_span)); \
    } \
    (void) 0

/// Checks that malformed byte sequences are rejected by Utf8::IsValid.
void TestInvalidStrings(nlTestSuite * inSuite, void * inContext)
{
    // cannot embed zeroes
    TEST_INVALID_BYTES(0x00);

    // Second byte is a continuation byte, but outside the range allowed for
    // its lead byte. Letters refer to rows (A)-(D) of the well-formed byte
    // sequence table documented in utf8.cpp.
    TEST_INVALID_BYTES(0xe0, 0b1001'1111, 0x80); // A: overlong (would encode below U+0800)
    TEST_INVALID_BYTES(0xed, 0b1011'0000, 0x80); // B: surrogate range (U+D800 and above)
    TEST_INVALID_BYTES(0xf0, 0b1000'1111, 0x80); // C: overlong (would encode below U+10000)

    // Outside the code point range (above U+10FFFF)
    TEST_INVALID_BYTES(0xf4, 0x90, 0x80, 0x80); // D

    // ASCII byte where a continuation byte is required
    TEST_INVALID_BYTES(0xe0, 0x09, 0x80);

    // Missing continuation
    TEST_INVALID_BYTES(0xC2);
    TEST_INVALID_BYTES(0xE0);
    TEST_INVALID_BYTES(0xE1);
    TEST_INVALID_BYTES(0xE1, 0x9F);
    TEST_INVALID_BYTES(0xED, 0x9F);
    TEST_INVALID_BYTES(0xEE, 0x9F);
    TEST_INVALID_BYTES(0xF0);
    TEST_INVALID_BYTES(0xF0, 0x9F);
    TEST_INVALID_BYTES(0xF0, 0x9F, 0x9F);
    TEST_INVALID_BYTES(0xF1);
    TEST_INVALID_BYTES(0xF1, 0x9F);
    TEST_INVALID_BYTES(0xF1, 0x9F, 0x9F);
    TEST_INVALID_BYTES(0xF4);
    TEST_INVALID_BYTES(0xF4, 0x9F);
    TEST_INVALID_BYTES(0xF4, 0x9F, 0x9F);

    // More tests from https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
    TEST_INVALID_BYTES(0x80); // First continuation byte
    TEST_INVALID_BYTES(0xBF); // Last continuation byte

    // Impossible bytes
    TEST_INVALID_BYTES(0xFE);
    TEST_INVALID_BYTES(0xFF);
    TEST_INVALID_BYTES(0xFE, 0xFE, 0xFF, 0xFF);

    // Overlong sequences
    TEST_INVALID_BYTES(0xc0, 0xaf);
    TEST_INVALID_BYTES(0xe0, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf);
    TEST_INVALID_BYTES(0xc0, 0x80);
    TEST_INVALID_BYTES(0xe0, 0x80, 0x80);
    TEST_INVALID_BYTES(0xf0, 0x80, 0x80, 0x80);
    TEST_INVALID_BYTES(0xf8, 0x80, 0x80, 0x80, 0x80);
    TEST_INVALID_BYTES(0xfc, 0x80, 0x80, 0x80, 0x80, 0x80);
}

} // namespace

// clang-format off
// Test cases that make up the suite; NL_TEST_SENTINEL() closes the table.
const nlTest sTests[] = {
    NL_TEST_DEF("TestValidStrings", TestValidStrings),
    NL_TEST_DEF("TestInvalidStrings", TestInvalidStrings),
    NL_TEST_SENTINEL(),
};
// clang-format on

int TestUtf8()
{
nlTestSuite theSuite = { "CHIP Variant tests", &sTests[0], nullptr, nullptr };
andy31415 marked this conversation as resolved.
Show resolved Hide resolved

// Run test suit againt one context.
andy31415 marked this conversation as resolved.
Show resolved Hide resolved
nlTestRunner(&theSuite, nullptr);
return nlTestRunnerStats(&theSuite);
}

// Hand TestUtf8 to the registration macro (presumably declared in
// lib/support/UnitTestRegistration.h, so that the shared test driver picks the
// suite up — confirm against that header).
CHIP_REGISTER_TEST_SUITE(TestUtf8);
200 changes: 200 additions & 0 deletions src/lib/support/utf8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
/*
*
* Copyright (c) 2020-2021 Project CHIP Authors
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utf8.h"

namespace chip {
namespace Utf8 {

namespace {
/**
State machine for UTF8 valid bytes

Table 3-7. Well-Formed UTF-8 Byte Sequences

Code Points | First B | Second B | Third B | Fourth B
------------------+----------+------------+---------+---------
U+0000..U+007F | 00..7F | | |
U+0080..U+07FF | C2..DF | 80..BF | |
U+0800..U+0FFF | E0 | A0..BF (A) | 80..BF |
U+1000..U+CFFF | E1..EC | 80..BF | 80..BF |
U+D000..U+D7FF | ED | 80..9F (B) | 80..BF |
U+E000..U+FFFF | EE..EF | 80..BF | 80..BF |
U+10000..U+3FFFF | F0 | 90..BF (C) | 80..BF | 80..BF
U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF
U+100000..U+10FFFF| F4 | 80..8F (D) | 80..BF | 80..BF
*/

/// States of the UTF-8 well-formedness state machine.
///
/// The kSecondByte_* states correspond to the rows marked (A)-(D) in the table
/// above, where the second byte of a sequence has a narrower range than the
/// generic continuation range 0x80..0xBF.
enum class ParserState
{
    kFirstByte,       // at a code point boundary: next byte starts a new sequence
    kSecondByte_A,    // after 0xE0: second byte must be 0xA0 .. 0xBF
    kSecondByte_B,    // after 0xED: second byte must be 0x80 .. 0x9F
    kSecondByte_C,    // after 0xF0: second byte must be 0x90 .. 0xBF
    kSecondByte_D,    // after 0xF4: second byte must be 0x80 .. 0x8F
    kExtraOneByte,    // 0x80 .. 0xBF once
    kExtraTwoBytes,   // 0x80 .. 0xBF twice
    kExtraThreeBytes, // 0x80 .. 0xBF three times
    //
    kInvalid, // some error
};

/// Returns `next` when `value` lies in the inclusive range [low, high],
/// otherwise ParserState::kInvalid.
constexpr ParserState AcceptRange(uint8_t value, uint8_t low, uint8_t high, ParserState next)
{
    return ((value >= low) && (value <= high)) ? next : ParserState::kInvalid;
}

/// Advances the UTF-8 state machine by one input byte.
///
/// @param state  state the machine is in before consuming `value`
/// @param value  the next input byte
/// @return the state after consuming `value`; ParserState::kInvalid when the
///         byte is not allowed in `state`. A zero byte is rejected in every
///         state, and kInvalid never transitions to any other state.
ParserState NextState(ParserState state, uint8_t value)
{
    if (value == 0)
    {
        // Refuse to have embedded 0s
        return ParserState::kInvalid;
    }

    switch (state)
    {
    case ParserState::kFirstByte:
        if (value <= 0x7F)
        {
            return ParserState::kFirstByte; // single-byte sequence, stay at a boundary
        }
        if (value < 0xC2)
        {
            // 0x80..0xBF: continuation byte without a lead byte
            // 0xC0..0xC1: not a valid lead byte (see table above)
            return ParserState::kInvalid;
        }
        if (value <= 0xDF)
        {
            return ParserState::kExtraOneByte;
        }
        if (value == 0xE0)
        {
            return ParserState::kSecondByte_A;
        }
        if (value == 0xED)
        {
            return ParserState::kSecondByte_B;
        }
        if (value <= 0xEF)
        {
            // 0xE1..0xEC and 0xEE..0xEF (0xE0 and 0xED were handled above)
            return ParserState::kExtraTwoBytes;
        }
        if (value == 0xF0)
        {
            return ParserState::kSecondByte_C;
        }
        if (value <= 0xF3)
        {
            return ParserState::kExtraThreeBytes;
        }
        if (value == 0xF4)
        {
            return ParserState::kSecondByte_D;
        }
        // 0xF5..0xFF never appear in well-formed UTF-8
        return ParserState::kInvalid;

    case ParserState::kSecondByte_A:
        return AcceptRange(value, 0xA0, 0xBF, ParserState::kExtraOneByte);
    case ParserState::kSecondByte_B:
        return AcceptRange(value, 0x80, 0x9F, ParserState::kExtraOneByte);
    case ParserState::kSecondByte_C:
        return AcceptRange(value, 0x90, 0xBF, ParserState::kExtraTwoBytes);
    case ParserState::kSecondByte_D:
        return AcceptRange(value, 0x80, 0x8F, ParserState::kExtraTwoBytes);

    case ParserState::kExtraOneByte:
        return AcceptRange(value, 0x80, 0xBF, ParserState::kFirstByte);
    case ParserState::kExtraTwoBytes:
        return AcceptRange(value, 0x80, 0xBF, ParserState::kExtraOneByte);
    case ParserState::kExtraThreeBytes:
        return AcceptRange(value, 0x80, 0xBF, ParserState::kExtraTwoBytes);

    default:
        return ParserState::kInvalid;
    }
}

} // namespace

/// Checks whether the bytes referenced by `span` pass the UTF-8 state machine.
///
/// @param span  bytes to check; only span.data() and span.size() are used
/// @return true when no byte drives the state machine into kInvalid and the
///         input ends at a code point boundary (no continuation byte is still
///         pending). An empty span is reported as valid.
bool IsValid(CharSpan span)
{
    ParserState current = ParserState::kFirstByte;
    const char * cursor = span.data();

    // Feed every byte through the state machine, bailing out on the first bad one.
    for (size_t remaining = span.size(); remaining > 0; --remaining, ++cursor)
    {
        current = NextState(current, static_cast<uint8_t>(*cursor));
        if (current == ParserState::kInvalid)
        {
            return false;
        }
    }

    // A truncated multi-byte sequence leaves the machine in a non-initial state.
    return current == ParserState::kFirstByte;
}

} // namespace Utf8
} // namespace chip
Loading
Loading