Skip to content

Commit f912cc0

Browse files
gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)
Also test that all extra cases are in BMP.
1 parent 48ec61a commit f912cc0

File tree

5 files changed

+212
-57
lines changed

5 files changed

+212
-57
lines changed

Lib/re/_casefix.py

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Auto-generated by Tools/scripts/generate_re_casefix.py.
2+
3+
# Maps the code of lowercased character to codes of different lowercased
4+
# characters which have the same uppercase.
5+
_EXTRA_CASES = {
6+
# LATIN SMALL LETTER I: LATIN SMALL LETTER DOTLESS I
7+
0x0069: (0x0131,), # 'i': 'ı'
8+
# LATIN SMALL LETTER S: LATIN SMALL LETTER LONG S
9+
0x0073: (0x017f,), # 's': 'ſ'
10+
# MICRO SIGN: GREEK SMALL LETTER MU
11+
0x00b5: (0x03bc,), # 'µ': 'μ'
12+
# LATIN SMALL LETTER DOTLESS I: LATIN SMALL LETTER I
13+
0x0131: (0x0069,), # 'ı': 'i'
14+
# LATIN SMALL LETTER LONG S: LATIN SMALL LETTER S
15+
0x017f: (0x0073,), # 'ſ': 's'
16+
# COMBINING GREEK YPOGEGRAMMENI: GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
17+
0x0345: (0x03b9, 0x1fbe), # '\u0345': 'ιι'
18+
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
19+
0x0390: (0x1fd3,), # 'ΐ': 'ΐ'
20+
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
21+
0x03b0: (0x1fe3,), # 'ΰ': 'ΰ'
22+
# GREEK SMALL LETTER BETA: GREEK BETA SYMBOL
23+
0x03b2: (0x03d0,), # 'β': 'ϐ'
24+
# GREEK SMALL LETTER EPSILON: GREEK LUNATE EPSILON SYMBOL
25+
0x03b5: (0x03f5,), # 'ε': 'ϵ'
26+
# GREEK SMALL LETTER THETA: GREEK THETA SYMBOL
27+
0x03b8: (0x03d1,), # 'θ': 'ϑ'
28+
# GREEK SMALL LETTER IOTA: COMBINING GREEK YPOGEGRAMMENI, GREEK PROSGEGRAMMENI
29+
0x03b9: (0x0345, 0x1fbe), # 'ι': '\u0345ι'
30+
# GREEK SMALL LETTER KAPPA: GREEK KAPPA SYMBOL
31+
0x03ba: (0x03f0,), # 'κ': 'ϰ'
32+
# GREEK SMALL LETTER MU: MICRO SIGN
33+
0x03bc: (0x00b5,), # 'μ': 'µ'
34+
# GREEK SMALL LETTER PI: GREEK PI SYMBOL
35+
0x03c0: (0x03d6,), # 'π': 'ϖ'
36+
# GREEK SMALL LETTER RHO: GREEK RHO SYMBOL
37+
0x03c1: (0x03f1,), # 'ρ': 'ϱ'
38+
# GREEK SMALL LETTER FINAL SIGMA: GREEK SMALL LETTER SIGMA
39+
0x03c2: (0x03c3,), # 'ς': 'σ'
40+
# GREEK SMALL LETTER SIGMA: GREEK SMALL LETTER FINAL SIGMA
41+
0x03c3: (0x03c2,), # 'σ': 'ς'
42+
# GREEK SMALL LETTER PHI: GREEK PHI SYMBOL
43+
0x03c6: (0x03d5,), # 'φ': 'ϕ'
44+
# GREEK BETA SYMBOL: GREEK SMALL LETTER BETA
45+
0x03d0: (0x03b2,), # 'ϐ': 'β'
46+
# GREEK THETA SYMBOL: GREEK SMALL LETTER THETA
47+
0x03d1: (0x03b8,), # 'ϑ': 'θ'
48+
# GREEK PHI SYMBOL: GREEK SMALL LETTER PHI
49+
0x03d5: (0x03c6,), # 'ϕ': 'φ'
50+
# GREEK PI SYMBOL: GREEK SMALL LETTER PI
51+
0x03d6: (0x03c0,), # 'ϖ': 'π'
52+
# GREEK KAPPA SYMBOL: GREEK SMALL LETTER KAPPA
53+
0x03f0: (0x03ba,), # 'ϰ': 'κ'
54+
# GREEK RHO SYMBOL: GREEK SMALL LETTER RHO
55+
0x03f1: (0x03c1,), # 'ϱ': 'ρ'
56+
# GREEK LUNATE EPSILON SYMBOL: GREEK SMALL LETTER EPSILON
57+
0x03f5: (0x03b5,), # 'ϵ': 'ε'
58+
# CYRILLIC SMALL LETTER VE: CYRILLIC SMALL LETTER ROUNDED VE
59+
0x0432: (0x1c80,), # 'в': 'ᲀ'
60+
# CYRILLIC SMALL LETTER DE: CYRILLIC SMALL LETTER LONG-LEGGED DE
61+
0x0434: (0x1c81,), # 'д': 'ᲁ'
62+
# CYRILLIC SMALL LETTER O: CYRILLIC SMALL LETTER NARROW O
63+
0x043e: (0x1c82,), # 'о': 'ᲂ'
64+
# CYRILLIC SMALL LETTER ES: CYRILLIC SMALL LETTER WIDE ES
65+
0x0441: (0x1c83,), # 'с': 'ᲃ'
66+
# CYRILLIC SMALL LETTER TE: CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
67+
0x0442: (0x1c84, 0x1c85), # 'т': 'ᲄᲅ'
68+
# CYRILLIC SMALL LETTER HARD SIGN: CYRILLIC SMALL LETTER TALL HARD SIGN
69+
0x044a: (0x1c86,), # 'ъ': 'ᲆ'
70+
# CYRILLIC SMALL LETTER YAT: CYRILLIC SMALL LETTER TALL YAT
71+
0x0463: (0x1c87,), # 'ѣ': 'ᲇ'
72+
# CYRILLIC SMALL LETTER ROUNDED VE: CYRILLIC SMALL LETTER VE
73+
0x1c80: (0x0432,), # 'ᲀ': 'в'
74+
# CYRILLIC SMALL LETTER LONG-LEGGED DE: CYRILLIC SMALL LETTER DE
75+
0x1c81: (0x0434,), # 'ᲁ': 'д'
76+
# CYRILLIC SMALL LETTER NARROW O: CYRILLIC SMALL LETTER O
77+
0x1c82: (0x043e,), # 'ᲂ': 'о'
78+
# CYRILLIC SMALL LETTER WIDE ES: CYRILLIC SMALL LETTER ES
79+
0x1c83: (0x0441,), # 'ᲃ': 'с'
80+
# CYRILLIC SMALL LETTER TALL TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
81+
0x1c84: (0x0442, 0x1c85), # 'ᲄ': 'тᲅ'
82+
# CYRILLIC SMALL LETTER THREE-LEGGED TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE
83+
0x1c85: (0x0442, 0x1c84), # 'ᲅ': 'тᲄ'
84+
# CYRILLIC SMALL LETTER TALL HARD SIGN: CYRILLIC SMALL LETTER HARD SIGN
85+
0x1c86: (0x044a,), # 'ᲆ': 'ъ'
86+
# CYRILLIC SMALL LETTER TALL YAT: CYRILLIC SMALL LETTER YAT
87+
0x1c87: (0x0463,), # 'ᲇ': 'ѣ'
88+
# CYRILLIC SMALL LETTER UNBLENDED UK: CYRILLIC SMALL LETTER MONOGRAPH UK
89+
0x1c88: (0xa64b,), # 'ᲈ': 'ꙋ'
90+
# LATIN SMALL LETTER S WITH DOT ABOVE: LATIN SMALL LETTER LONG S WITH DOT ABOVE
91+
0x1e61: (0x1e9b,), # 'ṡ': 'ẛ'
92+
# LATIN SMALL LETTER LONG S WITH DOT ABOVE: LATIN SMALL LETTER S WITH DOT ABOVE
93+
0x1e9b: (0x1e61,), # 'ẛ': 'ṡ'
94+
# GREEK PROSGEGRAMMENI: COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA
95+
0x1fbe: (0x0345, 0x03b9), # 'ι': '\u0345ι'
96+
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
97+
0x1fd3: (0x0390,), # 'ΐ': 'ΐ'
98+
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
99+
0x1fe3: (0x03b0,), # 'ΰ': 'ΰ'
100+
# CYRILLIC SMALL LETTER MONOGRAPH UK: CYRILLIC SMALL LETTER UNBLENDED UK
101+
0xa64b: (0x1c88,), # 'ꙋ': 'ᲈ'
102+
# LATIN SMALL LIGATURE LONG S T: LATIN SMALL LIGATURE ST
103+
0xfb05: (0xfb06,), # 'ſt': 'st'
104+
# LATIN SMALL LIGATURE ST: LATIN SMALL LIGATURE LONG S T
105+
0xfb06: (0xfb05,), # 'st': 'ſt'
106+
}

Lib/re/_compiler.py

+2-57
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import _sre
1414
from . import _parser
1515
from ._constants import *
16+
from ._casefix import _EXTRA_CASES
1617

1718
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
1819

@@ -27,62 +28,6 @@
2728
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
2829
}
2930

30-
# Sets of lowercase characters which have the same uppercase.
31-
_equivalences = (
32-
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
33-
(0x69, 0x131), # iı
34-
# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
35-
(0x73, 0x17f), # sſ
36-
# MICRO SIGN, GREEK SMALL LETTER MU
37-
(0xb5, 0x3bc), # µμ
38-
# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
39-
(0x345, 0x3b9, 0x1fbe), # \u0345ιι
40-
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
41-
(0x390, 0x1fd3), # ΐΐ
42-
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
43-
(0x3b0, 0x1fe3), # ΰΰ
44-
# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
45-
(0x3b2, 0x3d0), # βϐ
46-
# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
47-
(0x3b5, 0x3f5), # εϵ
48-
# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
49-
(0x3b8, 0x3d1), # θϑ
50-
# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
51-
(0x3ba, 0x3f0), # κϰ
52-
# GREEK SMALL LETTER PI, GREEK PI SYMBOL
53-
(0x3c0, 0x3d6), # πϖ
54-
# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
55-
(0x3c1, 0x3f1), # ρϱ
56-
# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
57-
(0x3c2, 0x3c3), # ςσ
58-
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
59-
(0x3c6, 0x3d5), # φϕ
60-
# CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE
61-
(0x432, 0x1c80), # вᲀ
62-
# CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE
63-
(0x434, 0x1c81), # дᲁ
64-
# CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O
65-
(0x43e, 0x1c82), # оᲂ
66-
# CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES
67-
(0x441, 0x1c83), # сᲃ
68-
# CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
69-
(0x442, 0x1c84, 0x1c85), # тᲄᲅ
70-
# CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN
71-
(0x44a, 0x1c86), # ъᲆ
72-
# CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT
73-
(0x463, 0x1c87), # ѣᲇ
74-
# CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK
75-
(0x1c88, 0xa64b), # ᲈꙋ
76-
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
77-
(0x1e61, 0x1e9b), # ṡẛ
78-
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
79-
(0xfb05, 0xfb06), # ſtst
80-
)
81-
82-
# Maps the lowercase code to lowercase codes which have the same uppercase.
83-
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
84-
for t in _equivalences for i in t}
85-
8631
class _CompileData:
8732
__slots__ = ('code', 'repeat_count')
8833
def __init__(self):
@@ -111,7 +56,7 @@ def _compile(data, pattern, flags):
11156
if flags & SRE_FLAG_UNICODE:
11257
iscased = _sre.unicode_iscased
11358
tolower = _sre.unicode_tolower
114-
fixes = _ignorecase_fixes
59+
fixes = _EXTRA_CASES
11560
else:
11661
iscased = _sre.ascii_iscased
11762
tolower = _sre.ascii_tolower

Makefile.pre.in

+6
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,12 @@ regen-test-frozenmain: $(BUILDPYTHON)
948948
# using Programs/freeze_test_frozenmain.py
949949
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Programs/freeze_test_frozenmain.py Programs/test_frozenmain.h
950950

951+
.PHONY: regen-re
952+
regen-re: $(BUILDPYTHON)
953+
# Regenerate Lib/re/_casefix.py
954+
# using Tools/scripts/generate_re_casefix.py
955+
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/scripts/generate_re_casefix.py $(srcdir)/Lib/re/_casefix.py
956+
951957
Programs/_testembed: Programs/_testembed.o $(LINK_PYTHON_DEPS)
952958
$(LINKCC) $(PY_CORE_LDFLAGS) $(LINKFORSHARED) -o $@ Programs/_testembed.o $(LINK_PYTHON_OBJS) $(LIBS) $(MODLIBS) $(SYSLIBS)
953959

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add script ``Tools/scripts/generate_re_casefix.py`` and the make target
2+
``regen-re`` for generating additional data for case-insensitive matching
3+
according to the current Unicode version.

Tools/scripts/generate_re_casefix.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#! /usr/bin/env python3
2+
# This script generates Lib/re/_casefix.py.
3+
4+
import collections
5+
import re
6+
import sys
7+
import unicodedata
8+
9+
def update_file(file, content):
    """Write *content* to *file* unless the file already holds that text.

    Returns True when the file was (re)written and False when the existing
    text already equals *content*.  A file that cannot be opened or decoded
    (OSError, ValueError) is treated as out of date and is overwritten.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            up_to_date = existing.read() == content
    except (OSError, ValueError):
        # Missing or unreadable target: fall through and (re)create it.
        up_to_date = False

    if up_to_date:
        return False

    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
19+
20+
re_casefix_template = """\
21+
# Auto-generated by Tools/scripts/generate_re_casefix.py.
22+
23+
# Maps the code of lowercased character to codes of different lowercased
24+
# characters which have the same uppercase.
25+
_EXTRA_CASES = {
26+
%s
27+
}
28+
"""
29+
30+
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name yield the placeholder 'U+XXXX' (upper-case
    hex, at least four digits) instead of raising.
    """
    placeholder = r'U+%04X' % i
    return unicodedata.name(chr(i), placeholder)
32+
33+
class hexint(int):
    """An int whose repr() is lower-case hex, zero-padded to '0x' + 4 digits."""

    def __repr__(self):
        # Width 6 counts the '0x' prefix, so values below 0x1000 are padded
        # to four hex digits and larger values are shown in full.
        return format(int(self), '#06x')
36+
37+
def alpha(i):
    """Return a displayable form of code point *i*.

    Alphabetic characters are returned as themselves; anything else is
    returned as the body of its ascii() representation (quotes stripped),
    e.g. an escape sequence such as ``\\u0345``.
    """
    char = chr(i)
    if char.isalpha():
        return char
    quoted = ascii(char)
    return quoted[1:-1]
40+
41+
42+
def main(outfile='Lib/re/_casefix.py'):
    """Generate the extra-case table for case-insensitive matching.

    Scans every code point, groups characters that share the same uppercase
    form, reduces each group to the distinct codes of its lowercased
    members, and writes the resulting ``_EXTRA_CASES`` mapping to *outfile*
    through update_file().

    If any group contains a code above 0xffff, the affected characters are
    listed on stderr and the script exits with status 1 without writing.
    """
    # Group all characters by their uppercase form.
    chars_by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        chars_by_upper[char.upper()] += char
    shared_upper = [chars for chars in chars_by_upper.values()
                    if len(chars) > 1]

    # For each group keep the sorted codes of the lowercased members, and
    # only groups where lowercasing still leaves more than one code.
    lower_code_groups = []
    for chars in shared_upper:
        lower_codes = {ord(char.lower()) for char in chars}
        if len(lower_codes) > 1:
            lower_code_groups.append(sorted(lower_codes))

    # Collect groups that contain a code outside the 16-bit range.
    bad_codes = []
    for codes in lower_code_groups:
        offender = next((code for code in codes if code > 0xffff), None)
        if offender is None:
            continue
        bad_codes.extend(codes)
        try:
            bad_codes.append(ord(chr(offender).upper()))
        except (ValueError, TypeError):
            # The uppercase form is not a single character; skip it.
            pass
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(code), code, uname(code)),
                  file=sys.stderr)
        sys.exit(1)

    # Map every code to the other codes of its group.
    mapping = {}
    for codes in lower_code_groups:
        for code in codes:
            mapping[code] = tuple(other for other in codes if code != other)

    # Render two lines per entry: a comment with the character names and
    # the "key: value" line itself.
    # NOTE(review): the leading whitespace inside these format strings is
    # reproduced as it appears in this view; confirm the intended indent.
    rendered = []
    for code, others in sorted(mapping.items()):
        other_names = ', '.join(map(uname, others))
        rendered.append(' # %s: %s' % (uname(code), other_names))
        other_chars = ''.join(map(alpha, others))
        rendered.append(" %r: %r, # '%s': '%s'" % (
            hexint(code),
            tuple(map(hexint, others)),
            alpha(code),
            other_chars,
        ))

    update_file(outfile, re_casefix_template % '\n'.join(rendered))
91+
92+
93+
# Script entry point: forward the command-line arguments to main(), so an
# optional first argument replaces the default output path.  main() accepts
# a single parameter, so more than one argument raises TypeError.
if __name__ == '__main__':
    # NOTE(review): `sys` is already imported at the top of this script, so
    # this local import is redundant (but harmless).
    import sys
    main(*sys.argv[1:])

0 commit comments

Comments
 (0)