Skip to content

Commit 9a02f4d

Browse files
committed
mk_invlists: Use new set subtraction ability
This allows the removal of some combinatorial complexity, and in doing so exposes a bug in which the combination of PO followed by East_Asian_OP (EA_OP) had not been added when it should have been. Currently, mktables splits the Line Break OP and CP classes into East Asian ones, and the remainders. The extra combinations occurred because the code here needed to take every existing OP and add an East_Asian (EA_OP) equivalent; same with CP. It's easy to miss one, and I did. This commit allows this split to be hidden from most places in mk_invlists.
1 parent 5810cf7 commit 9a02f4d

File tree

2 files changed

+10
-25
lines changed

2 files changed

+10
-25
lines changed

charclass_invlists.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192630,7 +192630,7 @@ static const U8 LB_table[42][42] = {
192630192630
/* NS*/ { 1, 1, 0, 1, 0, 1, 2, 0, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 0,1,0 },
192631192631
/* NU*/ { 0, 1, 0, 1, 0, 1, 2, 0, 2, 0, 1, 1, 1, 2, 0, 1, 1, 0, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 0,1,0 },
192632192632
/* OP*/ { 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2,1,0 },
192633-
/* PO*/ { 0, 1, 0, 1, 0, 1, 2, 0, 2, 0, 1, 1, 1, 2, 0, 1, 1, 0, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 0,10, 1, 1, 0, 1, 0, 2, 1, 0, 0, 0,1,0 },
192633+
/* PO*/ { 0, 1, 0, 1, 0, 1, 2, 0, 2, 0, 10, 1, 1, 2, 0, 1, 1, 0, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 0,10, 1, 1, 0, 1, 0, 2, 1, 0, 0, 0,1,0 },
192634192634
/* PR*/ { 0, 1, 0, 1, 0, 1, 2, 0, 2, 0, 10, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,10, 1, 1, 0, 1, 0, 2, 0, 0, 0, 0,1,0 },
192635192635
/* QU*/ { 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,1,0 },
192636192636
/* RI*/ { 1, 1, 0, 1, 0, 1, 2, 0, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,15, 0, 2, 1, 0, 0, 0,1,0 },

regen/mk_invlists.pl

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,6 +1986,9 @@ ()
19861986

19871987
my %lb_splits = setup_splits(\%lb_all_enums, $table_size, $has_unused,
19881988
{
1989+
CP => [ qw(CP East_Asian_CP) ],
1990+
OP => [ qw(OP East_Asian_OP) ],
1991+
EA => [ qw(East_Asian_OP East_Asian_CP) ],
19891992
Ideographic => [ 'Ideographic',
19901993
'Unassigned_Extended_Pictographic_Ideographic'
19911994
],
@@ -2104,17 +2107,15 @@ ()
21042107
# non-East-Asian opening punctuation nor non-East-Asian closing
21052108
# parentheses.
21062109
# (AL | HL | NU) × [OP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
2107-
# (what we call CP and OP here have already been modified by mktables to
2108-
# exclude the ea items
21092110
$rule = 30;
2110-
set_lb_nobreak('Alphabetic', 'Open_Punctuation', $rule);
2111-
set_lb_nobreak('Hebrew_Letter', 'Open_Punctuation', $rule);
2112-
set_lb_nobreak('Numeric', 'Open_Punctuation', $rule);
2111+
set_lb_nobreak('Alphabetic', 'Open_Punctuation_sans_EA', $rule);
2112+
set_lb_nobreak('Hebrew_Letter', 'Open_Punctuation_sans_EA', $rule);
2113+
set_lb_nobreak('Numeric', 'Open_Punctuation_sans_EA', $rule);
21132114

21142115
# [CP-[\p{ea=F}\p{ea=W}\p{ea=H}]] × (AL | HL | NU)
2115-
set_lb_nobreak('Close_Parenthesis', 'Alphabetic', $rule);
2116-
set_lb_nobreak('Close_Parenthesis', 'Hebrew_Letter', $rule);
2117-
set_lb_nobreak('Close_Parenthesis', 'Numeric', $rule);
2116+
set_lb_nobreak('Close_Parenthesis_sans_EA', 'Alphabetic', $rule);
2117+
set_lb_nobreak('Close_Parenthesis_sans_EA', 'Hebrew_Letter', $rule);
2118+
set_lb_nobreak('Close_Parenthesis_sans_EA', 'Numeric', $rule);
21182119

21192120
# LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
21202121
# IS × (AL | HL)
@@ -2180,16 +2181,13 @@ ()
21802181
# Given that (OP | HY )? is optional, we have to test for it in code.
21812182
add_lb_dfa('Prefix_Numeric', 'Open_Punctuation',
21822183
'LB_PR_or_PO_then_OP_or_HY', $rule);
2183-
add_lb_dfa('Prefix_Numeric', 'East_Asian_OP',
2184-
'LB_PR_or_PO_then_OP_or_HY', $rule);
21852184
add_lb_dfa('Postfix_Numeric', 'Open_Punctuation',
21862185
'LB_PR_or_PO_then_OP_or_HY', $rule);
21872186
add_lb_dfa('Prefix_Numeric', 'Hyphen', 'LB_PR_or_PO_then_OP_or_HY', $rule);
21882187
add_lb_dfa('Postfix_Numeric', 'Hyphen', 'LB_PR_or_PO_then_OP_or_HY', $rule);
21892188

21902189
# ( OP | HY ) × NU
21912190
set_lb_nobreak('Open_Punctuation', 'Numeric', $rule);
2192-
set_lb_nobreak('East_Asian_OP', 'Numeric', $rule);
21932191
set_lb_nobreak('Hyphen', 'Numeric', $rule);
21942192

21952193
# NU (NU | SY | IS)* × (NU | SY | IS | CL | CP )
@@ -2200,7 +2198,6 @@ ()
22002198
set_lb_nobreak('Numeric', 'Infix_Numeric', $rule);
22012199
set_lb_nobreak('Numeric', 'Close_Punctuation', $rule);
22022200
set_lb_nobreak('Numeric', 'Close_Parenthesis', $rule);
2203-
set_lb_nobreak('Numeric', 'East_Asian_CP', $rule);
22042201

22052202
# And then to
22062203
# NU (SY | IS)+ × (NU | SY | IS | CL | CP )
@@ -2213,8 +2210,6 @@ ()
22132210
'LB_SY_or_IS_then_various', $rule);
22142211
add_lb_dfa('Break_Symbols', 'Close_Parenthesis',
22152212
'LB_SY_or_IS_then_various', $rule);
2216-
add_lb_dfa('Break_Symbols', 'East_Asian_CP',
2217-
'LB_SY_or_IS_then_various', $rule);
22182213
add_lb_dfa('Infix_Numeric', 'Numeric', 'LB_SY_or_IS_then_various', $rule);
22192214
add_lb_dfa('Infix_Numeric', 'Break_Symbols',
22202215
'LB_SY_or_IS_then_various', $rule);
@@ -2224,8 +2219,6 @@ ()
22242219
'LB_SY_or_IS_then_various', $rule);
22252220
add_lb_dfa('Infix_Numeric', 'Close_Parenthesis',
22262221
'LB_SY_or_IS_then_various', $rule);
2227-
add_lb_dfa('Infix_Numeric', 'East_Asian_CP',
2228-
'LB_SY_or_IS_then_various', $rule);
22292222

22302223
# NU (NU | SY | IS)* (CL | CP)? × (PO | PR)
22312224
# We can eliminate the NU in the parenthesis, as there is a match as long
@@ -2236,8 +2229,6 @@ ()
22362229

22372230
add_lb_dfa('Close_Parenthesis', 'Postfix_Numeric',
22382231
'LB_various_then_PO_or_PR', $rule);
2239-
add_lb_dfa('East_Asian_CP', 'Postfix_Numeric',
2240-
'LB_various_then_PO_or_PR', $rule);
22412232
add_lb_dfa('Close_Punctuation', 'Postfix_Numeric',
22422233
'LB_various_then_PO_or_PR', $rule);
22432234
add_lb_dfa('Infix_Numeric', 'Postfix_Numeric',
@@ -2247,8 +2238,6 @@ ()
22472238

22482239
add_lb_dfa('Close_Parenthesis', 'Prefix_Numeric',
22492240
'LB_various_then_PO_or_PR', $rule);
2250-
add_lb_dfa('East_Asian_CP', 'Prefix_Numeric',
2251-
'LB_various_then_PO_or_PR', $rule);
22522241
add_lb_dfa('Close_Punctuation', 'Prefix_Numeric',
22532242
'LB_various_then_PO_or_PR', $rule);
22542243
add_lb_dfa('Infix_Numeric', 'Prefix_Numeric',
@@ -2350,17 +2339,14 @@ ()
23502339
$rule = 16;
23512340
set_lb_nobreak_ignoring_SP('Close_Punctuation', 'Nonstarter', $rule);
23522341
set_lb_nobreak_ignoring_SP('Close_Parenthesis', 'Nonstarter', $rule);
2353-
set_lb_nobreak_ignoring_SP('East_Asian_CP', 'Nonstarter', $rule);
23542342

23552343
# LB15 Do not break within ‘”[’, even with intervening spaces.
23562344
# QU SP* × OP
23572345
set_lb_nobreak_ignoring_SP('Quotation', 'Open_Punctuation', 15);
2358-
set_lb_nobreak_ignoring_SP('Quotation', 'East_Asian_OP', 15);
23592346

23602347
# LB14 Do not break after ‘[’, even after spaces.
23612348
# OP SP* ×
23622349
set_lb_nobreak_ignoring_SP('Open_Punctuation', '*', 14);
2363-
set_lb_nobreak_ignoring_SP('East_Asian_OP', '*', 14);
23642350

23652351
# LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces, as
23662352
# tailored by example 7 in http://www.unicode.org/reports/tr14/#Examples
@@ -2372,7 +2358,6 @@ ()
23722358
$rule = 13;
23732359
set_lb_nobreak_ignoring_SP('*', 'Close_Punctuation', $rule);
23742360
set_lb_nobreak_ignoring_SP('*', 'Close_Parenthesis', $rule);
2375-
set_lb_nobreak_ignoring_SP('*', 'East_Asian_CP', $rule);
23762361
set_lb_nobreak_ignoring_SP('*', 'Exclamation', $rule);
23772362
set_lb_nobreak_ignoring_SP('*', 'Infix_Numeric', $rule);
23782363
set_lb_nobreak_ignoring_SP('*', 'Break_Symbols', $rule);

0 commit comments

Comments
 (0)