Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-91575: Add a script for generating data for case-insensitive matching in re #91660

Merged
merged 2 commits into from
Apr 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions Lib/re/_casefix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
# Each key is the code point of a lowercase character; its value is a tuple
# with the code points of the *other* lowercase characters whose uppercase
# form is the same.  Every member of a group appears as a key, so the
# relation is stored in both directions (e.g. 0x0069 -> 0x0131 and
# 0x0131 -> 0x0069).  The comment above each entry gives the Unicode names
# as "key: values"; the trailing comment shows the characters themselves.
_EXTRA_CASES = {
# LATIN SMALL LETTER I: LATIN SMALL LETTER DOTLESS I
0x0069: (0x0131,), # 'i': 'ı'
# LATIN SMALL LETTER S: LATIN SMALL LETTER LONG S
0x0073: (0x017f,), # 's': 'ſ'
# MICRO SIGN: GREEK SMALL LETTER MU
0x00b5: (0x03bc,), # 'µ': 'μ'
# LATIN SMALL LETTER DOTLESS I: LATIN SMALL LETTER I
0x0131: (0x0069,), # 'ı': 'i'
# LATIN SMALL LETTER LONG S: LATIN SMALL LETTER S
0x017f: (0x0073,), # 'ſ': 's'
# COMBINING GREEK YPOGEGRAMMENI: GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
0x0345: (0x03b9, 0x1fbe), # '\u0345': 'ιι'
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
0x0390: (0x1fd3,), # 'ΐ': 'ΐ'
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
0x03b0: (0x1fe3,), # 'ΰ': 'ΰ'
# GREEK SMALL LETTER BETA: GREEK BETA SYMBOL
0x03b2: (0x03d0,), # 'β': 'ϐ'
# GREEK SMALL LETTER EPSILON: GREEK LUNATE EPSILON SYMBOL
0x03b5: (0x03f5,), # 'ε': 'ϵ'
# GREEK SMALL LETTER THETA: GREEK THETA SYMBOL
0x03b8: (0x03d1,), # 'θ': 'ϑ'
# GREEK SMALL LETTER IOTA: COMBINING GREEK YPOGEGRAMMENI, GREEK PROSGEGRAMMENI
0x03b9: (0x0345, 0x1fbe), # 'ι': '\u0345ι'
# GREEK SMALL LETTER KAPPA: GREEK KAPPA SYMBOL
0x03ba: (0x03f0,), # 'κ': 'ϰ'
# GREEK SMALL LETTER MU: MICRO SIGN
0x03bc: (0x00b5,), # 'μ': 'µ'
# GREEK SMALL LETTER PI: GREEK PI SYMBOL
0x03c0: (0x03d6,), # 'π': 'ϖ'
# GREEK SMALL LETTER RHO: GREEK RHO SYMBOL
0x03c1: (0x03f1,), # 'ρ': 'ϱ'
# GREEK SMALL LETTER FINAL SIGMA: GREEK SMALL LETTER SIGMA
0x03c2: (0x03c3,), # 'ς': 'σ'
# GREEK SMALL LETTER SIGMA: GREEK SMALL LETTER FINAL SIGMA
0x03c3: (0x03c2,), # 'σ': 'ς'
# GREEK SMALL LETTER PHI: GREEK PHI SYMBOL
0x03c6: (0x03d5,), # 'φ': 'ϕ'
# GREEK BETA SYMBOL: GREEK SMALL LETTER BETA
0x03d0: (0x03b2,), # 'ϐ': 'β'
# GREEK THETA SYMBOL: GREEK SMALL LETTER THETA
0x03d1: (0x03b8,), # 'ϑ': 'θ'
# GREEK PHI SYMBOL: GREEK SMALL LETTER PHI
0x03d5: (0x03c6,), # 'ϕ': 'φ'
# GREEK PI SYMBOL: GREEK SMALL LETTER PI
0x03d6: (0x03c0,), # 'ϖ': 'π'
# GREEK KAPPA SYMBOL: GREEK SMALL LETTER KAPPA
0x03f0: (0x03ba,), # 'ϰ': 'κ'
# GREEK RHO SYMBOL: GREEK SMALL LETTER RHO
0x03f1: (0x03c1,), # 'ϱ': 'ρ'
# GREEK LUNATE EPSILON SYMBOL: GREEK SMALL LETTER EPSILON
0x03f5: (0x03b5,), # 'ϵ': 'ε'
# CYRILLIC SMALL LETTER VE: CYRILLIC SMALL LETTER ROUNDED VE
0x0432: (0x1c80,), # 'в': 'ᲀ'
# CYRILLIC SMALL LETTER DE: CYRILLIC SMALL LETTER LONG-LEGGED DE
0x0434: (0x1c81,), # 'д': 'ᲁ'
# CYRILLIC SMALL LETTER O: CYRILLIC SMALL LETTER NARROW O
0x043e: (0x1c82,), # 'о': 'ᲂ'
# CYRILLIC SMALL LETTER ES: CYRILLIC SMALL LETTER WIDE ES
0x0441: (0x1c83,), # 'с': 'ᲃ'
# CYRILLIC SMALL LETTER TE: CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
0x0442: (0x1c84, 0x1c85), # 'т': 'ᲄᲅ'
# CYRILLIC SMALL LETTER HARD SIGN: CYRILLIC SMALL LETTER TALL HARD SIGN
0x044a: (0x1c86,), # 'ъ': 'ᲆ'
# CYRILLIC SMALL LETTER YAT: CYRILLIC SMALL LETTER TALL YAT
0x0463: (0x1c87,), # 'ѣ': 'ᲇ'
# CYRILLIC SMALL LETTER ROUNDED VE: CYRILLIC SMALL LETTER VE
0x1c80: (0x0432,), # 'ᲀ': 'в'
# CYRILLIC SMALL LETTER LONG-LEGGED DE: CYRILLIC SMALL LETTER DE
0x1c81: (0x0434,), # 'ᲁ': 'д'
# CYRILLIC SMALL LETTER NARROW O: CYRILLIC SMALL LETTER O
0x1c82: (0x043e,), # 'ᲂ': 'о'
# CYRILLIC SMALL LETTER WIDE ES: CYRILLIC SMALL LETTER ES
0x1c83: (0x0441,), # 'ᲃ': 'с'
# CYRILLIC SMALL LETTER TALL TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
0x1c84: (0x0442, 0x1c85), # 'ᲄ': 'тᲅ'
# CYRILLIC SMALL LETTER THREE-LEGGED TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE
0x1c85: (0x0442, 0x1c84), # 'ᲅ': 'тᲄ'
# CYRILLIC SMALL LETTER TALL HARD SIGN: CYRILLIC SMALL LETTER HARD SIGN
0x1c86: (0x044a,), # 'ᲆ': 'ъ'
# CYRILLIC SMALL LETTER TALL YAT: CYRILLIC SMALL LETTER YAT
0x1c87: (0x0463,), # 'ᲇ': 'ѣ'
# CYRILLIC SMALL LETTER UNBLENDED UK: CYRILLIC SMALL LETTER MONOGRAPH UK
0x1c88: (0xa64b,), # 'ᲈ': 'ꙋ'
# LATIN SMALL LETTER S WITH DOT ABOVE: LATIN SMALL LETTER LONG S WITH DOT ABOVE
0x1e61: (0x1e9b,), # 'ṡ': 'ẛ'
# LATIN SMALL LETTER LONG S WITH DOT ABOVE: LATIN SMALL LETTER S WITH DOT ABOVE
0x1e9b: (0x1e61,), # 'ẛ': 'ṡ'
# GREEK PROSGEGRAMMENI: COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA
0x1fbe: (0x0345, 0x03b9), # 'ι': '\u0345ι'
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
0x1fd3: (0x0390,), # 'ΐ': 'ΐ'
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
0x1fe3: (0x03b0,), # 'ΰ': 'ΰ'
# CYRILLIC SMALL LETTER MONOGRAPH UK: CYRILLIC SMALL LETTER UNBLENDED UK
0xa64b: (0x1c88,), # 'ꙋ': 'ᲈ'
# LATIN SMALL LIGATURE LONG S T: LATIN SMALL LIGATURE ST
0xfb05: (0xfb06,), # 'ſt': 'st'
# LATIN SMALL LIGATURE ST: LATIN SMALL LIGATURE LONG S T
0xfb06: (0xfb05,), # 'st': 'ſt'
}
59 changes: 2 additions & 57 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import _sre
from . import _parser
from ._constants import *
from ._casefix import _EXTRA_CASES

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

Expand All @@ -27,62 +28,6 @@
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}

# Groups of lowercase characters which have the same uppercase.
# Each inner tuple is one group of code points; the comment above a tuple
# lists the Unicode names of its members in order and the trailing comment
# shows the characters themselves.
_equivalences = (
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
(0x69, 0x131), # iı
# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
(0x73, 0x17f), # sſ
# MICRO SIGN, GREEK SMALL LETTER MU
(0xb5, 0x3bc), # µμ
# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
(0x345, 0x3b9, 0x1fbe), # \u0345ιι
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
(0x390, 0x1fd3), # ΐΐ
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
(0x3b0, 0x1fe3), # ΰΰ
# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
(0x3b2, 0x3d0), # βϐ
# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
(0x3b5, 0x3f5), # εϵ
# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
(0x3b8, 0x3d1), # θϑ
# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
(0x3ba, 0x3f0), # κϰ
# GREEK SMALL LETTER PI, GREEK PI SYMBOL
(0x3c0, 0x3d6), # πϖ
# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
(0x3c1, 0x3f1), # ρϱ
# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
(0x3c2, 0x3c3), # ςσ
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
(0x3c6, 0x3d5), # φϕ
# CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE
(0x432, 0x1c80), # вᲀ
# CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE
(0x434, 0x1c81), # дᲁ
# CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O
(0x43e, 0x1c82), # оᲂ
# CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES
(0x441, 0x1c83), # сᲃ
# CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
(0x442, 0x1c84, 0x1c85), # тᲄᲅ
# CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN
(0x44a, 0x1c86), # ъᲆ
# CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT
(0x463, 0x1c87), # ѣᲇ
# CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK
(0x1c88, 0xa64b), # ᲈꙋ
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
(0x1e61, 0x1e9b), # ṡẛ
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
(0xfb05, 0xfb06), # ſtst
)

# For every code in an equivalence group, record the remaining members of
# that group, i.e. the other lowercase codes which share its uppercase.
_ignorecase_fixes = {
    code: tuple(other for other in group if other != code)
    for group in _equivalences
    for code in group
}

class _CompileData:
__slots__ = ('code', 'repeat_count')
def __init__(self):
Expand Down Expand Up @@ -111,7 +56,7 @@ def _compile(data, pattern, flags):
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
fixes = _EXTRA_CASES
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
Expand Down
6 changes: 6 additions & 0 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,12 @@ regen-test-frozenmain: $(BUILDPYTHON)
# using Programs/freeze_test_frozenmain.py
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Programs/freeze_test_frozenmain.py Programs/test_frozenmain.h

.PHONY: regen-re
regen-re: $(BUILDPYTHON)
# Regenerate Lib/re/_casefix.py
# using Tools/scripts/generate_re_casefix.py
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/scripts/generate_re_casefix.py $(srcdir)/Lib/re/_casefix.py

Programs/_testembed: Programs/_testembed.o $(LINK_PYTHON_DEPS)
$(LINKCC) $(PY_CORE_LDFLAGS) $(LINKFORSHARED) -o $@ Programs/_testembed.o $(LINK_PYTHON_OBJS) $(LIBS) $(MODLIBS) $(SYSLIBS)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add script ``Tools/scripts/generate_re_casefix.py`` and the make target
``regen-re`` for generating additional data for case-insensitive matching
according to the current Unicode version.
95 changes: 95 additions & 0 deletions Tools/scripts/generate_re_casefix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#! /usr/bin/env python3
# This script generates Lib/re/_casefix.py.

import collections
import re
import sys
import unicodedata

def update_file(file, content):
    """Write *content* to *file* unless the file already contains it.

    The file is read and written as UTF-8 text.  A file that cannot be
    opened or decoded is treated as out of date and is overwritten.

    Return True if the file was (re)written, False if it was left untouched.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            up_to_date = existing.read() == content
    except (OSError, ValueError):
        # Missing/unreadable file or undecodable bytes: fall through to write.
        up_to_date = False

    if up_to_date:
        return False

    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True

# Skeleton of the generated module.  The single ``%s`` placeholder is filled
# by main() with the newline-joined dictionary entries; everything else is
# written to the output file verbatim.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""

def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name yield the fallback string ``U+XXXX``
    (uppercase hex, at least four digits).
    """
    fallback = r'U+%04X' % i
    return unicodedata.name(chr(i), fallback)

class hexint(int):
    """An int whose repr() is ``0x``-prefixed lowercase hexadecimal.

    The repr is zero-padded to a total width of six characters, so values
    up to 0xffff always show four hex digits (e.g. ``0x0069``).
    """

    def __repr__(self):
        return format(int(self), '#06x')

def alpha(i):
    """Return a printable form of code point *i* for use in comments.

    Alphabetic characters are returned as themselves; anything else is
    returned as its ascii() escape without the surrounding quotes.
    """
    char = chr(i)
    if char.isalpha():
        return char
    quoted = ascii(char)
    return quoted[1:-1]


def main(outfile='Lib/re/_casefix.py'):
    """Generate the ``_EXTRA_CASES`` table and write it to *outfile*.

    Every code point up to ``sys.maxunicode`` is grouped by its uppercase
    form.  Groups that still contain more than one distinct character after
    lowercasing are kept, and each lowercase code is mapped to the other
    codes of its group.  The table is rendered into ``re_casefix_template``
    and written with update_file(), which leaves an up-to-date file alone.

    If any kept group contains a code point above U+FFFF, the affected
    characters are listed on stderr and the script exits with status 1
    without writing anything.
    """
    # Group all characters by their uppercase form.  A list per group avoids
    # rebuilding a string on every append.
    chars_by_upper = collections.defaultdict(list)
    for c in map(chr, range(sys.maxunicode + 1)):
        chars_by_upper[c.upper()].append(c)

    # Sorted codes of the lowercased characters of each group, keeping only
    # groups with at least two distinct lowercase codes.  ord() raises
    # TypeError if a lowercase form is longer than one character.
    equivalent_lower_codes = []
    for chars in chars_by_upper.values():
        if len(chars) < 2:
            continue
        lower_codes = {ord(c.lower()) for c in chars}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Collect the groups that contain a non-BMP code point, together with the
    # uppercase of the first such code point.  A set reports each character
    # once.  NOTE(review): presumably the consumer of this table only handles
    # code points up to U+FFFF -- confirm against Lib/re/_compiler.py.
    bad_codes = set()
    for codes in equivalent_lower_codes:
        first_non_bmp = next((i for i in codes if i > 0xffff), None)
        if first_non_bmp is None:
            continue
        bad_codes.update(codes)
        try:
            bad_codes.add(ord(chr(first_non_bmp).upper()))
        except (ValueError, TypeError):
            # The uppercase form is not a single character; nothing to add.
            pass
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for i in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
                  file=sys.stderr)
        sys.exit(1)

    # Map every code to the other members of its group.
    mapping = {i: tuple(j for j in t if i != j)
               for t in equivalent_lower_codes
               for i in t}

    # Render two lines per entry: a comment with the Unicode names, then the
    # dictionary item followed by the characters themselves.  Entries are
    # indented by four spaces and inline comments are preceded by two spaces
    # so that the generated module follows PEP 8.
    items = []
    for i, t in sorted(mapping.items()):
        items.append('    # %s: %s' % (
            uname(i),
            ', '.join(map(uname, t)),
        ))
        items.append("    %r: %r,  # '%s': '%s'" % (
            hexint(i),
            tuple(map(hexint, t)),
            alpha(i),
            ''.join(map(alpha, t)),
        ))

    update_file(outfile, re_casefix_template % '\n'.join(items))


if __name__ == '__main__':
    # ``sys`` is already imported at module level, so no local import is
    # needed.  An optional command-line argument is forwarded to main() as
    # the output path.
    main(*sys.argv[1:])