Skip to content

Commit

Permalink
ICU-22986 GL takes CM
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Dec 20, 2024
1 parent e3bc073 commit 7d60bb8
Show file tree
Hide file tree
Showing 34 changed files with 81 additions and 39 deletions.
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_loose.txt
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_normal.txt
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_normal_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/brkitr/rules/line_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:
Expand Down
66 changes: 50 additions & 16 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1705,7 +1705,34 @@ class RemapRule : public SegmentationRule {
resolved[i].appliedRule = this;
resolved[i].indexInRemapped.reset();
}
// While replacing, we need to check that we are not creating
// surrogate pairs. Since appendReplacement performs two
// concatenations (the unreplaced segment and the replacement), we
// need to check in two places: whether the unreplaced segment
// starts with a trailing surrogate that ends up after a leading
// surrogate, and whether the replaced segment starts with a leading
// surrogate that ends up after a trailing surrogate.
// We break the pair by replacing one of the surrogates with U+FFFF,
// which has the same properties for all but line breaking, and the
// same behaviour in line breaking (lb=SG and lb=XX are both treated
// as lb=AL).
std::optional<int32_t> trailingLead;
if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
trailingLead = result.length() - 1;
}

matcher->appendReplacement(result, replacement_, status);

if (trailingLead && *trailingLead + 1 < result.length() &&
U16_IS_TRAIL(result[*trailingLead + 1])) {
result.setCharAt(*trailingLead, u'\uFFFF');
}

if (matcher->start(status) + offset > 0 &&
U16_IS_LEAD(result[matcher->start(status) + offset - 1]) &&
U16_IS_TRAIL(result[matcher->start(status) + offset])) {
result.setCharAt(matcher->start(status) + offset, u'\uFFFF');
}
offset = result.length() - *resolved[i].indexInRemapped;
}
for (; i < static_cast<int32_t>(resolved.size()); ++i) {
Expand All @@ -1714,7 +1741,17 @@ class RemapRule : public SegmentationRule {
}
*resolved[i].indexInRemapped += offset;
}

std::optional<int32_t> trailingLead;
if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
trailingLead = result.length() - 1;
}
matcher->appendTail(result);
if (trailingLead && *trailingLead + 1 < result.length() &&
U16_IS_TRAIL(result[*trailingLead + 1])) {
result.setCharAt(*trailingLead, u'\uFFFF');
}

if (resolved.back().indexInRemapped != result.length()) {
std::string indices;
for (const auto r : resolved) {
Expand Down Expand Up @@ -2906,20 +2943,11 @@ RBBILineMonkey::RBBILineMonkey() :

std::list<std::pair<std::string, UnicodeSet>> partition;

// TODO(egg): The following are two workarounds for what seem to be ICU bugs;
// with UREGEX_DOTALL (but not UREGEX_MULTILINE):
// 1. /.*\u000A/ does not match CR LF;
// 2. /$/ matches ( BK | CR | LF | NL ) eot.
rules.push_back(std::make_unique<RegexRule>(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()"));
rules.push_back(std::make_unique<RegexRule>(
uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)",
uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])",
u'×',
uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)"));

rules.push_back(std::make_unique<RegexRule>(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()"));
// This one could be part of the rules.
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)"));
// Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead.
// The generated rules use the same (?!.).
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))"));

// --- NOLI ME TANGERE ---
// Generated by GenerateBreakTest.java in the Unicode tools.
Expand Down Expand Up @@ -3015,7 +3043,7 @@ RBBILineMonkey::RBBILineMonkey() :
rules.push_back(std::make_unique<RegexRule>(uR"(× $CP)", uR"()", u'×', uR"(\p{Line_Break=CP})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $SY)", uR"()", u'×', uR"(\p{Line_Break=Break_Symbols})"));
rules.push_back(std::make_unique<RegexRule>(uR"($OP $SP* ×)", uR"(\p{Line_Break=Open_Punctuation} \p{Line_Break=Space}*)", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | ^ ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ))", uR"()", u'×', uR"([\p{Line_Break=Quotation} && \p{gc=Pf}] ( \p{Line_Break=Space} | \p{Line_Break=Glue} | \p{Line_Break=Word_Joiner} | \p{Line_Break=Close_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=CP} | \p{Line_Break=Exclamation} | \p{Line_Break=Infix_Numeric} | \p{Line_Break=Break_Symbols} | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=ZWSpace} | (?!.) ))"));
rules.push_back(std::make_unique<RegexRule>(uR"($SP ÷ $IS $NU)", uR"(\p{Line_Break=Space})", u'÷', uR"(\p{Line_Break=Infix_Numeric} \p{Line_Break=Numeric})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $IS)", uR"()", u'×', uR"(\p{Line_Break=Infix_Numeric})"));
Expand All @@ -3027,10 +3055,10 @@ RBBILineMonkey::RBBILineMonkey() :
rules.push_back(std::make_unique<RegexRule>(uR"([^$EastAsian] × $QU)", uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])", u'×', uR"(\p{Line_Break=Quotation})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $QU ( [^$EastAsian] | $eot ))", uR"()", u'×', uR"(\p{Line_Break=Quotation} ( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | (?!.) ))"));
rules.push_back(std::make_unique<RegexRule>(uR"($QU × [^$EastAsian])", uR"(\p{Line_Break=Quotation})", u'×', uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | [^$EastAsian] ) $QU ×)", uR"(( ^ | [^[\p{ea=F}\p{ea=W}\p{ea=H}]] ) \p{Line_Break=Quotation})", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( [^$EastAsian] | $sot ) $QU ×)", uR"(( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | ^ ) \p{Line_Break=Quotation})", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(÷ $CB)", uR"()", u'÷', uR"(\p{Line_Break=Contingent_Break})"));
rules.push_back(std::make_unique<RegexRule>(uR"($CB ÷)", uR"(\p{Line_Break=Contingent_Break})", u'÷', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} | ^ ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $BA)", uR"()", u'×', uR"(\p{Line_Break=Break_After})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $HY)", uR"()", u'×', uR"(\p{Line_Break=Hyphen})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $NS)", uR"()", u'×', uR"([\p{Line_Break=Nonstarter} \p{Line_Break=Conditional_Japanese_Starter}])"));
Expand Down Expand Up @@ -3080,6 +3108,7 @@ RBBILineMonkey::RBBILineMonkey() :
// --- End of generated code. ---



// TODO(egg): This could just as well be part of the rules…
rules.push_back(std::make_unique<RegexRule>(uR"(ALL ÷ / ÷ ALL)",
uR"()", u'÷',
Expand Down Expand Up @@ -3122,7 +3151,12 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
}
for (std::size_t i = 0; i < resolved.size(); ++i) {
if (resolved[i].appliedRule == nullptr) {
printf("Failed to resolve at %zu" , i);
printf("Failed to resolve at %zu between U+%04X and U+%04X ", i, s.char32At(i-1), s.char32At(i));
if (resolved[i].indexInRemapped.has_value()) {
printf("which is remapped %d between U+%04X and U+%04X", *resolved[i].indexInRemapped,
remapped.char32At(*resolved[i].indexInRemapped - 1),
remapped.char32At(*resolved[i].indexInRemapped));
}
std::terminate();
} else {
setAppliedRule(i, resolved[i].appliedRule->name().c_str());
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/testdata/break_rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/testdata/break_rules/line_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/testdata/break_rules/line_loose.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/testdata/break_rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/testdata/break_rules/line_normal.txt
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/testdata/break_rules/line_normal_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
4 changes: 4 additions & 0 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2214,3 +2214,7 @@ Bangkok)•</data>
<data>•« Complex »« chaining » •</data>
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.

# A hyphen that follows a non-breaking space is treated as word-initial even
# when the space carries an intervening combining mark; by LB20a there is no
# break opportunity after the hyphen. ICU 76 handled that case incorrectly
# (ICU-22986).
<data>• ̄-k•</data>
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -1276,7 +1276,6 @@ int next(int startPos) {
fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 20a");
continue;
Expand All @@ -1285,7 +1284,8 @@ int next(int startPos) {
fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1);
}
if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fGL.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 20a");
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];

# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];

LB12a: [^SP BA HY] CM* GL;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2214,3 +2214,7 @@ Bangkok)•</data>
<data>•« Complex »« chaining » •</data>
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.

# A hyphen that follows a non-breaking space is treated as word-initial even
# when the space carries an intervening combining mark; by LB20a there is no
# break opportunity after the hyphen. ICU 76 handled that case incorrectly
# (ICU-22986).
<data>• ̄-k•</data>

0 comments on commit 7d60bb8

Please sign in to comment.