Skip to content

gh-91575: Update case-insensitive matching in re to the latest Unicode version #91580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,22 @@
(0x3c2, 0x3c3), # ςσ
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
(0x3c6, 0x3d5), # φϕ
# CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE
(0x432, 0x1c80), # вᲀ
# CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE
(0x434, 0x1c81), # дᲁ
# CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O
(0x43e, 0x1c82), # оᲂ
# CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES
(0x441, 0x1c83), # сᲃ
# CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
(0x442, 0x1c84, 0x1c85), # тᲄᲅ
# CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN
(0x44a, 0x1c86), # ъᲆ
# CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT
(0x463, 0x1c87), # ѣᲇ
# CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK
(0x1c88, 0xa64b), # ᲈꙋ
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
(0x1e61, 0x1e9b), # ṡẛ
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
Expand Down Expand Up @@ -339,11 +355,19 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
charmap += b'\0' * 0xff00
continue
# Character set contains non-BMP character codes.
# For range, all BMP characters in the range are already
# processed.
if fixup:
hascased = True
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_UNI_IGNORE works.
# For now, IN_UNI_IGNORE+LITERAL and
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
# characters, because two characters (at least one of
# which is not in the BMP) match case-insensitively
# if and only if:
# 1) c1.lower() == c2.lower()
# 2) c1.lower() == c2 or c1.lower().upper() == c2
# Also, both c.lower() and c.lower().upper() are single
# characters for every non-BMP character.
if op is RANGE:
op = RANGE_UNI_IGNORE
tail.append((op, av))
Expand Down
55 changes: 49 additions & 6 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,16 +859,30 @@ def test_ignore_case(self):
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

assert '\u212a'.lower() == 'k' # 'K'
# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
self.assertTrue(re.match(r'K', '\u212a', re.I))
self.assertTrue(re.match(r'k', '\u212a', re.I))
self.assertTrue(re.match(r'\u212a', 'K', re.I))
self.assertTrue(re.match(r'\u212a', 'k', re.I))
assert '\u017f'.upper() == 'S' # 'ſ'

# Two different characters have the same uppercase.
assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ'
self.assertTrue(re.match(r'S', '\u017f', re.I))
self.assertTrue(re.match(r's', '\u017f', re.I))
self.assertTrue(re.match(r'\u017f', 'S', re.I))
self.assertTrue(re.match(r'\u017f', 's', re.I))

# Two different characters have the same uppercase. Unicode 9.0+.
assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В'
self.assertTrue(re.match(r'\u0412', '\u0432', re.I))
self.assertTrue(re.match(r'\u0412', '\u1c80', re.I))
self.assertTrue(re.match(r'\u0432', '\u0412', re.I))
self.assertTrue(re.match(r'\u0432', '\u1c80', re.I))
self.assertTrue(re.match(r'\u1c80', '\u0412', re.I))
self.assertTrue(re.match(r'\u1c80', '\u0432', re.I))

# Two different characters have the same multicharacter uppercase.
assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
Expand All @@ -882,16 +896,31 @@ def test_ignore_case_set(self):
self.assertTrue(re.match(br'[19a]', b'a', re.I))
self.assertTrue(re.match(br'[19a]', b'A', re.I))
self.assertTrue(re.match(br'[19A]', b'a', re.I))
assert '\u212a'.lower() == 'k' # 'K'

# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
assert '\u017f'.upper() == 'S' # 'ſ'

# Two different characters have the same uppercase.
assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ'
self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
self.assertTrue(re.match(r'[19\u017f]', 's', re.I))

# Two different characters have the same uppercase. Unicode 9.0+.
assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В'
self.assertTrue(re.match(r'[19\u0412]', '\u0432', re.I))
self.assertTrue(re.match(r'[19\u0412]', '\u1c80', re.I))
self.assertTrue(re.match(r'[19\u0432]', '\u0412', re.I))
self.assertTrue(re.match(r'[19\u0432]', '\u1c80', re.I))
self.assertTrue(re.match(r'[19\u1c80]', '\u0412', re.I))
self.assertTrue(re.match(r'[19\u1c80]', '\u0432', re.I))

# Two different characters have the same multicharacter uppercase.
assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
Expand All @@ -915,16 +944,30 @@ def test_ignore_case_range(self):
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))

assert '\u212a'.lower() == 'k' # 'K'
# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
assert '\u017f'.upper() == 'S' # 'ſ'

# Two different characters have the same uppercase.
assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ'
self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))

# Two different characters have the same uppercase. Unicode 9.0+.
assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В'
self.assertTrue(re.match(r'[\u0411-\u0413]', '\u0432', re.I))
self.assertTrue(re.match(r'[\u0411-\u0413]', '\u1c80', re.I))
self.assertTrue(re.match(r'[\u0431-\u0433]', '\u0412', re.I))
self.assertTrue(re.match(r'[\u0431-\u0433]', '\u1c80', re.I))
self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0412', re.I))
self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0432', re.I))

# Two different characters have the same multicharacter uppercase.
assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Update case-insensitive matching in the :mod:`re` module to the latest
Unicode version.