Skip to content

Commit

Permalink
LibUnicode+LibWeb: Rework bidirectional_class() API a bit
Browse files Browse the repository at this point in the history
Before this commit, GenerateUnicodeData generated a
`BidirectionalClass` enum directly from UnicodeData.txt.

Now, GenerateUnicodeData instead generates `BidirectionalClassInternal`
and CharacterTypes.h has an explicit enum `BidiClass` with nicer names
-- no underscores, but also names more similar to ICUs names.

(Since CharacterTypes.h is included in some of LibUnicode's generators,
it can't refer to generated `BidirectionalClassInternal`, so we instead
have a big manual mapping switch in UnicodeData.cpp.)

`bidirectional_class()` used to return an `Optional<BidirectionalClass>`
for when unicode data was disabled. Now we return `BidiClass` and
just return `BidiClass::LeftToRight` instead of making every client
do this.

It also updates LibWeb's Element.cpp to this new system.

Also remove now-unused `bidirectional_class_from_string()`.

This kind of cherry-picks the 4th commit of
LadybirdBrowser/ladybird#239 (aa3a30870b58c47cb37bce1418d7e6bee7af71d9):
The CharacterTypes.h, Element.cpp and TestUnicodeCharacterTypes.cpp changes are
by trflynn.

Co-authored-by: Tim Flynn <trflynn89@serenityos.org>
  • Loading branch information
nico and Tim Flynn committed Nov 24, 2024
1 parent aff777c commit 0197bfc
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ namespace Unicode {
generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);
generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values(), unicode_data.bidirectional_class_aliases);
generate_enum("BidirectionalClassInternal"sv, {}, unicode_data.bidirectional_classes.values(), unicode_data.bidirectional_class_aliases);

generator.append(R"~~~(
struct SpecialCasing {
Expand Down Expand Up @@ -1038,7 +1038,7 @@ struct CodePointNameComparator : public CodePointRangeComparator {
struct BidiClassData {
CodePointRange code_point_range {};
BidirectionalClass bidi_class {};
BidirectionalClassInternal bidi_class {};
};
struct CodePointBidiClassComparator : public CodePointRangeComparator {
Expand Down Expand Up @@ -1303,7 +1303,7 @@ static constexpr Array<BidiClassData, @size@> s_bidirectional_classes { {
generator.set("first", ByteString::formatted("{:#x}", data.code_point_range.first));
generator.set("last", ByteString::formatted("{:#x}", data.code_point_range.last));
generator.set("bidi_class", data.bidi_class);
generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }");
generator.append("{ { @first@, @last@ }, BidirectionalClassInternal::@bidi_class@ }");

if (bidi_classes_in_current_row == max_bidi_classes_per_row) {
bidi_classes_in_current_row = 0;
Expand Down Expand Up @@ -1445,7 +1445,7 @@ Optional<u32> code_point_composition(u32 first_code_point, u32 second_code_point
return {};
}
Optional<BidirectionalClass> bidirectional_class(u32 code_point)
Optional<BidirectionalClassInternal> bidirectional_class_internal(u32 code_point)
{
if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {}))
return entry->bidi_class;
Expand Down Expand Up @@ -1513,8 +1513,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@)
TRY(append_prop_search("WordBreakProperty"sv, "word_break_property"sv, "s_word_break_properties"sv));
TRY(append_prop_search("SentenceBreakProperty"sv, "sentence_break_property"sv, "s_sentence_break_properties"sv));

TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {}));

generator.append(R"~~~(
}
)~~~");
Expand Down
24 changes: 6 additions & 18 deletions Tests/LibUnicode/TestUnicodeCharacterTypes.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
Expand Down Expand Up @@ -416,25 +416,13 @@ TEST_CASE(code_point_display_name)

TEST_CASE(code_point_bidirectional_character_type)
{
auto code_point_bidi_class = [](u32 code_point) {
auto bidi_class = Unicode::bidirectional_class(code_point);
VERIFY(bidi_class.has_value());
return bidi_class.release_value();
};

auto bidi_class_from_string = [](StringView name) {
auto result = Unicode::bidirectional_class_from_string(name);
VERIFY(result.has_value());
return result.release_value();
};

// Left-to-right
EXPECT_EQ(code_point_bidi_class('A'), bidi_class_from_string("L"sv));
EXPECT_EQ(code_point_bidi_class('z'), bidi_class_from_string("L"sv));
EXPECT_EQ(Unicode::bidirectional_class('A'), Unicode::BidiClass::LeftToRight);
EXPECT_EQ(Unicode::bidirectional_class('z'), Unicode::BidiClass::LeftToRight);
// European number
EXPECT_EQ(code_point_bidi_class('7'), bidi_class_from_string("EN"sv));
EXPECT_EQ(Unicode::bidirectional_class('7'), Unicode::BidiClass::EuropeanNumber);
// Whitespace
EXPECT_EQ(code_point_bidi_class(' '), bidi_class_from_string("WS"sv));
EXPECT_EQ(Unicode::bidirectional_class(' '), Unicode::BidiClass::WhiteSpaceNeutral);
// Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM)
EXPECT_EQ(code_point_bidi_class(0xFEB4), bidi_class_from_string("AL"sv));
EXPECT_EQ(Unicode::bidirectional_class(0xFEB4), Unicode::BidiClass::RightToLeftArabic);
}
68 changes: 66 additions & 2 deletions Userland/Libraries/LibUnicode/CharacterTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,71 @@ bool __attribute__((weak)) code_point_has_grapheme_break_property(u32, GraphemeB
bool __attribute__((weak)) code_point_has_word_break_property(u32, WordBreakProperty) { return {}; }
bool __attribute__((weak)) code_point_has_sentence_break_property(u32, SentenceBreakProperty) { return {}; }

Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; }
Optional<BidirectionalClass> __attribute__((weak)) bidirectional_class(u32) { return {}; }
Optional<BidirectionalClassInternal> __attribute__((weak)) bidirectional_class_internal(u32) { return {}; }

#if ENABLE_UNICODE_DATA
static constexpr BidiClass bidi_class_from_bidirectional_class_interal(BidirectionalClassInternal direction)
{
switch (direction) {
case BidirectionalClassInternal::Arabic_Number:
return BidiClass::ArabicNumber;
case BidirectionalClassInternal::Paragraph_Separator:
return BidiClass::BlockSeparator;
case BidirectionalClassInternal::Boundary_Neutral:
return BidiClass::BoundaryNeutral;
case BidirectionalClassInternal::Common_Separator:
return BidiClass::CommonNumberSeparator;
case BidirectionalClassInternal::Nonspacing_Mark:
return BidiClass::DirNonSpacingMark;
case BidirectionalClassInternal::European_Number:
return BidiClass::EuropeanNumber;
case BidirectionalClassInternal::European_Separator:
return BidiClass::EuropeanNumberSeparator;
case BidirectionalClassInternal::European_Terminator:
return BidiClass::EuropeanNumberTerminator;
case BidirectionalClassInternal::First_Strong_Isolate:
return BidiClass::FirstStrongIsolate;
case BidirectionalClassInternal::Left_To_Right:
return BidiClass::LeftToRight;
case BidirectionalClassInternal::Left_To_Right_Embedding:
return BidiClass::LeftToRightEmbedding;
case BidirectionalClassInternal::Left_To_Right_Isolate:
return BidiClass::LeftToRightIsolate;
case BidirectionalClassInternal::Left_To_Right_Override:
return BidiClass::LeftToRightOverride;
case BidirectionalClassInternal::Other_Neutral:
return BidiClass::OtherNeutral;
case BidirectionalClassInternal::Pop_Directional_Format:
return BidiClass::PopDirectionalFormat;
case BidirectionalClassInternal::Pop_Directional_Isolate:
return BidiClass::PopDirectionalIsolate;
case BidirectionalClassInternal::Right_To_Left:
return BidiClass::RightToLeft;
case BidirectionalClassInternal::Arabic_Letter:
return BidiClass::RightToLeftArabic;
case BidirectionalClassInternal::Right_To_Left_Embedding:
return BidiClass::RightToLeftEmbedding;
case BidirectionalClassInternal::Right_To_Left_Isolate:
return BidiClass::RightToLeftIsolate;
case BidirectionalClassInternal::Right_To_Left_Override:
return BidiClass::RightToLeftOverride;
case BidirectionalClassInternal::Segment_Separator:
return BidiClass::SegmentSeparator;
case BidirectionalClassInternal::White_Space:
return BidiClass::WhiteSpaceNeutral;
}

VERIFY_NOT_REACHED();
}
#endif

BidiClass bidirectional_class([[maybe_unused]] u32 code_point)
{
#if ENABLE_UNICODE_DATA
if (auto bidi_class_internal = bidirectional_class_internal(code_point); bidi_class_internal.has_value())
return bidi_class_from_bidirectional_class_interal(bidi_class_internal.value());
#endif
return BidiClass::LeftToRight;
}

}
32 changes: 30 additions & 2 deletions Userland/Libraries/LibUnicode/CharacterTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,35 @@ bool code_point_has_grapheme_break_property(u32 code_point, GraphemeBreakPropert
bool code_point_has_word_break_property(u32 code_point, WordBreakProperty property);
bool code_point_has_sentence_break_property(u32 code_point, SentenceBreakProperty property);

Optional<BidirectionalClass> bidirectional_class_from_string(StringView);
Optional<BidirectionalClass> bidirectional_class(u32 code_point);
enum class BidirectionalClassInternal : u8;
Optional<BidirectionalClassInternal> bidirectional_class_internal(u32 code_point);

enum class BidiClass {
ArabicNumber, // AN
BlockSeparator, // B
BoundaryNeutral, // BN
CommonNumberSeparator, // CS
DirNonSpacingMark, // NSM
EuropeanNumber, // EN
EuropeanNumberSeparator, // ES
EuropeanNumberTerminator, // ET
FirstStrongIsolate, // FSI
LeftToRight, // L
LeftToRightEmbedding, // LRE
LeftToRightIsolate, // LRI
LeftToRightOverride, // LRO
OtherNeutral, // ON
PopDirectionalFormat, // PDF
PopDirectionalIsolate, // PDI
RightToLeft, // R
RightToLeftArabic, // AL
RightToLeftEmbedding, // RLE
RightToLeftIsolate, // RLI
RightToLeftOverride, // RLO
SegmentSeparator, // S
WhiteSpaceNeutral, // WS
};

BidiClass bidirectional_class(u32 code_point);

}
2 changes: 1 addition & 1 deletion Userland/Libraries/LibUnicode/Forward.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace Unicode {

enum class BidirectionalClass : u8;
enum class BidiClass;
enum class Block : u16;
enum class EmojiGroup : u8;
enum class GeneralCategory : u8;
Expand Down
20 changes: 6 additions & 14 deletions Userland/Libraries/LibWeb/DOM/Element.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2575,22 +2575,14 @@ bool Element::is_auto_directionality_form_associated_element() const
// https://html.spec.whatwg.org/multipage/dom.html#auto-directionality
Optional<Element::Directionality> Element::auto_directionality() const
{
static auto bidirectional_class_L = Unicode::bidirectional_class_from_string("L"sv);
static auto bidirectional_class_AL = Unicode::bidirectional_class_from_string("AL"sv);
static auto bidirectional_class_R = Unicode::bidirectional_class_from_string("R"sv);

// AD-HOC: Assume 'ltr' if Unicode data generation is disabled.
if (!bidirectional_class_L.has_value())
return Directionality::Ltr;

// https://html.spec.whatwg.org/multipage/dom.html#text-node-directionality
auto text_node_directionality = [](Text const& text_node) -> Optional<Directionality> {
// 1. If text's data does not contain a code point whose bidirectional character type is L, AL, or R, then return null.
// 2. Let codePoint be the first code point in text's data whose bidirectional character type is L, AL, or R.
Optional<Unicode::BidirectionalClass> found_character_bidi_class;
Optional<Unicode::BidiClass> found_character_bidi_class;
for (auto code_point : Utf8View(text_node.data())) {
auto bidi_class = Unicode::bidirectional_class(code_point);
if (first_is_one_of(bidi_class, bidirectional_class_L, bidirectional_class_AL, bidirectional_class_R)) {
if (first_is_one_of(bidi_class, Unicode::BidiClass::LeftToRight, Unicode::BidiClass::RightToLeftArabic, Unicode::BidiClass::RightToLeft)) {
found_character_bidi_class = bidi_class;
break;
}
Expand All @@ -2599,12 +2591,12 @@ Optional<Element::Directionality> Element::auto_directionality() const
return {};

// 3. If codePoint is of bidirectional character type AL or R, then return 'rtl'.
if (first_is_one_of(*found_character_bidi_class, bidirectional_class_AL, bidirectional_class_R))
if (first_is_one_of(*found_character_bidi_class, Unicode::BidiClass::RightToLeftArabic, Unicode::BidiClass::RightToLeft))
return Directionality::Rtl;

// 4. If codePoint is of bidirectional character type L, then return 'ltr'.
// NOTE: codePoint should always be of bidirectional character type L by this point, so we can just return 'ltr' here.
VERIFY(*found_character_bidi_class == bidirectional_class_L);
VERIFY(*found_character_bidi_class == Unicode::BidiClass::LeftToRight);
return Directionality::Ltr;
};

Expand All @@ -2618,9 +2610,9 @@ Optional<Element::Directionality> Element::auto_directionality() const
// and there is no character of bidirectional character type L anywhere before it in the element's value, then return 'rtl'.
for (auto code_point : Utf8View(value)) {
auto bidi_class = Unicode::bidirectional_class(code_point);
if (bidi_class == bidirectional_class_L)
if (bidi_class == Unicode::BidiClass::LeftToRight)
break;
if (bidi_class == bidirectional_class_AL || bidi_class == bidirectional_class_R)
if (bidi_class == Unicode::BidiClass::RightToLeftArabic || bidi_class == Unicode::BidiClass::RightToLeft)
return Directionality::Rtl;
}

Expand Down

0 comments on commit 0197bfc

Please sign in to comment.