ICU-22986 GL takes CM

unicode-org · Dec 20, 2024 · 7d60bb8 · 7d60bb8
1 parent e3bc073
commit 7d60bb8
Show file tree

Hide file tree

Showing 34 changed files with 81 additions and 39 deletions.
diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt
@@ -297,7 +297,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus; 
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt
@@ -298,7 +298,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -306,7 +306,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -318,7 +318,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
@@ -331,7 +331,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -299,7 +299,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -304,7 +304,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
@@ -317,7 +317,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt
@@ -310,7 +310,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 #             and then to default UAX #14 behaviour (UTC-179-C32).
 #
 ^($HY | $HH) $CM* $ALPlus;
-$GL ($HY | $HH) $CM* $ALPlus; 
+$GL $CM* ($HY | $HH) $CM* $ALPlus;
 # Non-breaking CB from LB8a:
 $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
 # Non-breaking SP from LB14:

diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
@@ -1705,7 +1705,34 @@ class RemapRule : public SegmentationRule {
                 resolved[i].appliedRule = this;
                 resolved[i].indexInRemapped.reset();
             }
+            // While replacing, we need to check that we are not creating
+            // surrogate pairs.  Since appendReplacement performs two
+            // concatenations (the unreplaced segment and the replacement), we
+            // need to check in two places: whether the unreplaced segment
+            // starts with a trailing surrogate that ends up after a leading
+            // surrogate, and whether the replacement starts with a trailing
+            // surrogate that ends up after a leading surrogate.
+            // We break the pair by replacing one of the surrogates with U+FFFF,
+            // which has the same properties for all but line breaking, and the
+            // same behaviour in line breaking (lb=SG and lb=XX are both treated
+            // as lb=AL).
+            std::optional<int32_t> trailingLead;
+            if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
+                trailingLead = result.length() - 1;
+            }
+
             matcher->appendReplacement(result, replacement_, status);
+
+            if (trailingLead && *trailingLead + 1 < result.length() &&
+                    U16_IS_TRAIL(result[*trailingLead + 1])) {
+                result.setCharAt(*trailingLead, u'\uFFFF');
+            }
+
+            if (matcher->start(status) + offset > 0 &&
+                    U16_IS_LEAD(result[matcher->start(status) + offset - 1]) &&
+                    U16_IS_TRAIL(result[matcher->start(status) + offset])) {
+                result.setCharAt(matcher->start(status) + offset, u'\uFFFF');
+            }
             offset = result.length() - *resolved[i].indexInRemapped;
         }
         for (; i < static_cast<int32_t>(resolved.size()); ++i) {
@@ -1714,7 +1741,17 @@ class RemapRule : public SegmentationRule {
             }
             *resolved[i].indexInRemapped += offset;
         }
+
+        std::optional<int32_t> trailingLead;
+        if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
+            trailingLead = result.length() - 1;
+        }
         matcher->appendTail(result);
+        if (trailingLead && *trailingLead + 1 < result.length() &&
+                U16_IS_TRAIL(result[*trailingLead + 1])) {
+            result.setCharAt(*trailingLead, u'\uFFFF');
+        }
+
         if (resolved.back().indexInRemapped != result.length()) {
             std::string indices;
             for (const auto r : resolved) {
@@ -2906,20 +2943,11 @@ RBBILineMonkey::RBBILineMonkey() :
 
     std::list<std::pair<std::string, UnicodeSet>> partition;
 
-    // TODO(egg): The following two workarounds for what seems to be ICU bugs;
-    // with UREGEX_DOTALL (but not UREGEX_MULTILINE):
-    // 1. /.*\u000A/ does not match CR LF;
-    // 2. /$/ matches ( BK | CR | LF | NL ) eot.
-    rules.push_back(std::make_unique<RegexRule>(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()"));
-    rules.push_back(std::make_unique<RegexRule>(
-        uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)",
-        uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])",
-        u'×',
-        uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)"));
-
     rules.push_back(std::make_unique<RegexRule>(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()"));
     // This one could be part of the rules.
-    rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)"));
+    // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead.
+    // The generated rules use the same (?!.).
+    rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))"));
 
     // --- NOLI ME TANGERE ---
     // Generated by GenerateBreakTest.java in the Unicode tools.
@@ -3015,7 +3043,7 @@ RBBILineMonkey::RBBILineMonkey() :
     rules.push_back(std::make_unique<RegexRule>(uR"(× $CP)", uR"()", u'×', uR"(\p{Line_Break=CP})"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $SY)", uR"()", u'×', uR"(\p{Line_Break=Break_Symbols})"));
     rules.push_back(std::make_unique<RegexRule>(uR"($OP $SP* ×)", uR"(\p{Line_Break=Open_Punctuation} \p{Line_Break=Space}*)", u'×', uR"()"));
-    rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
+    rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | ^ ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ))", uR"()", u'×', uR"([\p{Line_Break=Quotation} && \p{gc=Pf}] ( \p{Line_Break=Space} | \p{Line_Break=Glue} | \p{Line_Break=Word_Joiner} | \p{Line_Break=Close_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=CP} | \p{Line_Break=Exclamation} | \p{Line_Break=Infix_Numeric} | \p{Line_Break=Break_Symbols} | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=ZWSpace} | (?!.) ))"));
     rules.push_back(std::make_unique<RegexRule>(uR"($SP ÷ $IS $NU)", uR"(\p{Line_Break=Space})", u'÷', uR"(\p{Line_Break=Infix_Numeric} \p{Line_Break=Numeric})"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $IS)", uR"()", u'×', uR"(\p{Line_Break=Infix_Numeric})"));
@@ -3027,10 +3055,10 @@ RBBILineMonkey::RBBILineMonkey() :
     rules.push_back(std::make_unique<RegexRule>(uR"([^$EastAsian] × $QU)", uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])", u'×', uR"(\p{Line_Break=Quotation})"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $QU ( [^$EastAsian] | $eot ))", uR"()", u'×', uR"(\p{Line_Break=Quotation} ( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | (?!.) ))"));
     rules.push_back(std::make_unique<RegexRule>(uR"($QU × [^$EastAsian])", uR"(\p{Line_Break=Quotation})", u'×', uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])"));
-    rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | [^$EastAsian] ) $QU ×)", uR"(( ^ | [^[\p{ea=F}\p{ea=W}\p{ea=H}]] ) \p{Line_Break=Quotation})", u'×', uR"()"));
+    rules.push_back(std::make_unique<RegexRule>(uR"(( [^$EastAsian] | $sot ) $QU ×)", uR"(( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | ^ ) \p{Line_Break=Quotation})", u'×', uR"()"));
     rules.push_back(std::make_unique<RegexRule>(uR"(÷ $CB)", uR"()", u'÷', uR"(\p{Line_Break=Contingent_Break})"));
     rules.push_back(std::make_unique<RegexRule>(uR"($CB ÷)", uR"(\p{Line_Break=Contingent_Break})", u'÷', uR"()"));
-    rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
+    rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} | ^ ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $BA)", uR"()", u'×', uR"(\p{Line_Break=Break_After})"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $HY)", uR"()", u'×', uR"(\p{Line_Break=Hyphen})"));
     rules.push_back(std::make_unique<RegexRule>(uR"(× $NS)", uR"()", u'×', uR"([\p{Line_Break=Nonstarter} \p{Line_Break=Conditional_Japanese_Starter}])"));
@@ -3080,6 +3108,7 @@ RBBILineMonkey::RBBILineMonkey() :
     // --- End of generated code. ---
 
 
+
     // TODO(egg): This could just as well be part of the rules…
     rules.push_back(std::make_unique<RegexRule>(uR"(ALL ÷ / ÷ ALL)",
                                                 uR"()", u'÷',
@@ -3122,7 +3151,12 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
     }
     for (std::size_t i = 0; i < resolved.size(); ++i) {
         if (resolved[i].appliedRule == nullptr) {
-            printf("Failed to resolve at %zu" , i);
+            printf("Failed to resolve at %zu between U+%04X and U+%04X ", i, s.char32At(i-1), s.char32At(i));
+            if (resolved[i].indexInRemapped.has_value()) {
+                printf("which is remapped %d between U+%04X and U+%04X", *resolved[i].indexInRemapped,
+                       remapped.char32At(*resolved[i].indexInRemapped - 1),
+                       remapped.char32At(*resolved[i].indexInRemapped));
+            }
             std::terminate();
         } else {
             setAppliedRule(i, resolved[i].appliedRule->name().c_str());

diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt
@@ -176,7 +176,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4c/source/test/testdata/break_rules/line_cj.txt b/icu4c/source/test/testdata/break_rules/line_cj.txt
@@ -180,7 +180,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt
@@ -181,7 +181,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
@@ -200,7 +200,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA BAX HY] CM* GL;

diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt
@@ -182,7 +182,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
@@ -186,7 +186,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
@@ -2214,3 +2214,7 @@ Bangkok)•</data>
 <data>•« Complex »« chaining » •</data>
 <data>•« .618 »•</data>  # Interaction with the ICU tailoring to break before such numbers.
 
+# A hyphen following non-breaking space that carries an intervening combining
+# mark is treated as word-initial; by LB20a it has no break opportunity after
+# it.  A bug in ICU 76 incorrectly handled that case (ICU-22986).
+<data>• ̄-k•</data>
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk
diff --git a/...ain/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk b/...ain/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk
diff --git a/...in/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk b/...in/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk
diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -1276,7 +1276,6 @@ int next(int startPos) {
                         fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
                         fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
                         fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
-                        fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
                         fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
                         setAppliedRule(pos, "LB 20a");
                         continue;
@@ -1285,7 +1284,8 @@ int next(int startPos) {
                             fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
                         breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1);
                     }
-                    if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
+                    if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                            fGL.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
                         setAppliedRule(pos, "LB 20a");
                         continue;
                     }

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt
@@ -176,7 +176,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt
@@ -180,7 +180,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt
@@ -181,7 +181,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -200,7 +200,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA BAX HY] CM* GL;

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt
@@ -182,7 +182,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -186,7 +186,7 @@ LB11.2:      SP WJ;
 LB11.3:      WJ CM* [^CM];
 
 # Needs to apply before LB12, because the new monkeys are not greedy.
-LB20a.2:   GL (HY | HH) CM* AL;
+LB20a.2:   GL CM* (HY | HH) CM* AL;
 LB12:      GL CM* [^CM];
 
 LB12a:       [^SP BA HY] CM* GL;

diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -2214,3 +2214,7 @@ Bangkok)•</data>
 <data>•« Complex »« chaining » •</data>
 <data>•« .618 »•</data>  # Interaction with the ICU tailoring to break before such numbers.
 
+# A hyphen following non-breaking space that carries an intervening combining
+# mark is treated as word-initial; by LB20a it has no break opportunity after
+# it.  A bug in ICU 76 incorrectly handled that case (ICU-22986).
+<data>• ̄-k•</data>