From b7d6af1825eb3e9113b532fcdcd04f92ffa415e6 Mon Sep 17 00:00:00 2001 From: Wim Jeantine-Glenn Date: Sun, 28 Jul 2024 09:06:02 -0500 Subject: [PATCH 1/6] Remove re._compile Move implementation directly into re.compile. Simplifies tracebacks and reduces call stack. --- Lib/re/__init__.py | 107 ++++++++++++++++++++++---------------------- Lib/test/test_re.py | 2 +- 2 files changed, 54 insertions(+), 55 deletions(-) diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 7e8abbf6ffe155..8c5dd18e938633 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -164,17 +164,17 @@ class RegexFlag: def match(pattern, string, flags=0): """Try to apply the pattern at the start of the string, returning a Match object, or None if no match was found.""" - return _compile(pattern, flags).match(string) + return compile(pattern, flags).match(string) def fullmatch(pattern, string, flags=0): """Try to apply the pattern to all of the string, returning a Match object, or None if no match was found.""" - return _compile(pattern, flags).fullmatch(string) + return compile(pattern, flags).fullmatch(string) def search(pattern, string, flags=0): """Scan through string looking for a match to the pattern, returning a Match object, or None if no match was found.""" - return _compile(pattern, flags).search(string) + return compile(pattern, flags).search(string) class _ZeroSentinel(int): pass @@ -205,7 +205,7 @@ def sub(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel DeprecationWarning, stacklevel=2 ) - return _compile(pattern, flags).sub(repl, string, count) + return compile(pattern, flags).sub(repl, string, count) sub.__text_signature__ = '(pattern, repl, string, count=0, flags=0)' def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel): @@ -235,7 +235,7 @@ def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentine DeprecationWarning, stacklevel=2 ) - return _compile(pattern, flags).subn(repl, string, count) + return compile(pattern, flags).subn(repl, string, count) subn.__text_signature__ = '(pattern, repl, string, count=0, flags=0)' def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel): @@ -264,7 +264,7 @@ def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel) DeprecationWarning, stacklevel=2 ) - return _compile(pattern, flags).split(string, maxsplit) + return compile(pattern, flags).split(string, maxsplit) split.__text_signature__ = '(pattern, string, maxsplit=0, flags=0)' def findall(pattern, string, flags=0): @@ -275,60 +275,17 @@ def findall(pattern, string, flags=0): has more than one group. Empty matches are included in the result.""" - return _compile(pattern, flags).findall(string) + return compile(pattern, flags).findall(string) def finditer(pattern, string, flags=0): """Return an iterator over all non-overlapping matches in the string. For each match, the iterator returns a Match object. Empty matches are included in the result.""" - return _compile(pattern, flags).finditer(string) + return compile(pattern, flags).finditer(string) def compile(pattern, flags=0): - "Compile a regular expression pattern, returning a Pattern object." - return _compile(pattern, flags) - -def purge(): - "Clear the regular expression caches" - _cache.clear() - _cache2.clear() - _compile_template.cache_clear() - - -# SPECIAL_CHARS -# closing ')', '}' and ']' -# '-' (a range in character set) -# '&', '~', (extended character set operations) -# '#' (comment) and WHITESPACE (ignored) in verbose mode -_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'} - -def escape(pattern): - """ - Escape special characters in a string. - """ - if isinstance(pattern, str): - return pattern.translate(_special_chars_map) - else: - pattern = str(pattern, 'latin1') - return pattern.translate(_special_chars_map).encode('latin1') - -Pattern = type(_compiler.compile('', 0)) -Match = type(_compiler.compile('', 0).match('')) - -# -------------------------------------------------------------------- -# internals - -# Use the fact that dict keeps the insertion order. -# _cache2 uses the simple FIFO policy which has better latency. -# _cache uses the LRU policy which has better hit rate. -_cache = {} # LRU -_cache2 = {} # FIFO -_MAXCACHE = 512 -_MAXCACHE2 = 256 -assert _MAXCACHE2 < _MAXCACHE - -def _compile(pattern, flags): - # internal: compile pattern + """Compile a regular expression pattern, returning a Pattern object.""" if isinstance(flags, RegexFlag): flags = flags.value try: @@ -371,6 +328,45 @@ def _compile(pattern, flags): _cache2[key] = p return p +def purge(): + "Clear the regular expression caches" + _cache.clear() + _cache2.clear() + _compile_template.cache_clear() + + +# SPECIAL_CHARS +# closing ')', '}' and ']' +# '-' (a range in character set) +# '&', '~', (extended character set operations) +# '#' (comment) and WHITESPACE (ignored) in verbose mode +_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'} + +def escape(pattern): + """ + Escape special characters in a string. + """ + if isinstance(pattern, str): + return pattern.translate(_special_chars_map) + else: + pattern = str(pattern, 'latin1') + return pattern.translate(_special_chars_map).encode('latin1') + +Pattern = type(_compiler.compile('', 0)) +Match = type(_compiler.compile('', 0).match('')) + +# -------------------------------------------------------------------- +# internals + +# Use the fact that dict keeps the insertion order. +# _cache2 uses the simple FIFO policy which has better latency. +# _cache uses the LRU policy which has better hit rate. +_cache = {} # LRU +_cache2 = {} # FIFO +_MAXCACHE = 512 +_MAXCACHE2 = 256 +assert _MAXCACHE2 < _MAXCACHE + @functools.lru_cache(_MAXCACHE) def _compile_template(pattern, repl): # internal: compile replacement pattern @@ -381,9 +377,12 @@ def _compile_template(pattern, repl): import copyreg def _pickle(p): - return _compile, (p.pattern, p.flags) + return compile, (p.pattern, p.flags) + +# compatibility alias to deserialize old pickles +_compile = compile -copyreg.pickle(Pattern, _pickle, _compile) +copyreg.pickle(Pattern, _pickle, compile) # -------------------------------------------------------------------- # experimental stuff (see python-dev discussions for details) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index a93c2aef170fc8..f65c489c786b4e 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1227,7 +1227,7 @@ def test_pickling(self): pickled = pickle.dumps(oldpat, proto) newpat = pickle.loads(pickled) self.assertEqual(newpat, oldpat) - # current pickle expects the _compile() reconstructor in re module + # previous pickles may expect the _compile() reconstructor in re module from re import _compile # noqa: F401 def test_copying(self): From 37b210cc8cb71b608907adf2b106cebe0b60571c Mon Sep 17 00:00:00 2001 From: Wim Jeantine-Glenn Date: Sun, 4 Aug 2024 13:27:28 -0500 Subject: [PATCH 2/6] Add news entry --- .../next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst diff --git a/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst b/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst new file mode 100644 index 00000000000000..f2e97c239b0b92 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst @@ -0,0 +1 @@ +Remove re._compile, leaving a compatibility alias to re.compile. From d5bf1954195c78b1f9a5d7d8774f8ac40bd1d118 Mon Sep 17 00:00:00 2001 From: Wim Jeantine-Glenn Date: Wed, 7 Aug 2024 10:13:44 -0500 Subject: [PATCH 3/6] Update Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- .../next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst b/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst index f2e97c239b0b92..2f66bd07c1a9e1 100644 --- a/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst +++ b/Misc/NEWS.d/next/Library/2024-08-04-13-22-32.gh-issue-122358.wpnaq4.rst @@ -1 +1 @@ -Remove re._compile, leaving a compatibility alias to re.compile. +Remove :func:`!re._compile`, leaving a compatibility alias to :func:`re.compile`. From 0dcf82bbe5bfc2167a54944191de496a9a62fe8d Mon Sep 17 00:00:00 2001 From: Wim Jeantine-Glenn Date: Thu, 8 Aug 2024 22:19:52 -0500 Subject: [PATCH 4/6] reduce stacklevel in warnings --- Lib/re/_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 0990255b22c219..2a54b5622a4799 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -557,7 +557,7 @@ def _parse(source, state, verbose, nested, first=False): import warnings warnings.warn( 'Possible nested set at position %d' % source.tell(), - FutureWarning, stacklevel=nested + 6 + FutureWarning, stacklevel=nested + 5 ) negate = sourcematch("^") # check remaining characters @@ -580,7 +580,7 @@ def _parse(source, state, verbose, nested, first=False): 'symmetric difference' if this == '~' else 'union', source.tell() - 1), - FutureWarning, stacklevel=nested + 6 + FutureWarning, stacklevel=nested + 5 ) code1 = LITERAL, _ord(this) if sourcematch("-"): @@ -603,7 +603,7 @@ def _parse(source, state, verbose, nested, first=False): warnings.warn( 'Possible set difference at position %d' % ( source.tell() - 2), - FutureWarning, stacklevel=nested + 6 + FutureWarning, stacklevel=nested + 5 ) code2 = LITERAL, _ord(that) if code1[0] != LITERAL or code2[0] != LITERAL: From df79dd5cc1467f4cfc9e9902d216498e7391f51e Mon Sep 17 00:00:00 2001 From: Wim Jeantine-Glenn Date: Thu, 8 Aug 2024 22:20:36 -0500 Subject: [PATCH 5/6] Add a test for pickles that expect _compile() --- Lib/test/test_re.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 2bdaa7c85fbecb..ae54c73e69d76b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1256,8 +1256,23 @@ def test_pickling(self): pickled = pickle.dumps(oldpat, proto) newpat = pickle.loads(pickled) self.assertEqual(newpat, oldpat) - # previous pickles may expect the _compile() reconstructor in re module + + def test_unpickling(self): + import pickle + pat = re.compile(".*") from re import _compile # noqa: F401 + # previous pickles may expect the _compile() reconstructor in re module. + # the four pickles below are examples of this at various protocol versions. + pickles = [ + b'cre\n_compile\np0\n(V.*\np1\nI32\ntp2\nRp3\n.', + b'cre\n_compile\nq\x00(X\x02\x00\x00\x00.*q\x01K tq\x02Rq\x03.', + b'\x80\x03cre\n_compile\nq\x00X\x02\x00\x00\x00.*q\x01K \x86q\x02Rq\x03.', + b'\x80\x04\x95\x1e\x00\x00\x00\x00\x00\x00\x00\x8c\x02re\x94\x8c\x08' + b'_compile\x94\x93\x94\x8c\x02.*\x94K \x86\x94R\x94.', + ] + for pickled in pickles: + unpickled = pickle.loads(pickled) + self.assertEqual(unpickled, pat) def test_copying(self): import copy From 2581e10b68f24c5602d2835f5f5d27628e899c2e Mon Sep 17 00:00:00 2001 From: Wim Glenn Date: Tue, 13 Aug 2024 13:37:40 -0500 Subject: [PATCH 6/6] Skip re pkg frames in warnings This uses the warnings feature skip_file_prefixes added in 3.12 https://github.com/python/cpython/pull/100840/ --- Lib/re/_parser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 2a54b5622a4799..0330805e63804d 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -12,6 +12,7 @@ # XXX: show string offset and offending character for all errors +import os from ._constants import * SPECIAL_CHARS = ".\\[{()*+?^$|" @@ -508,6 +509,8 @@ def _parse_sub(source, state, verbose, nested): subpattern.append((BRANCH, (None, items))) return subpattern +_warn_skips = (os.path.dirname(__file__),) + def _parse(source, state, verbose, nested, first=False): # parse a simple pattern subpattern = SubPattern(state) @@ -557,7 +560,7 @@ def _parse(source, state, verbose, nested, first=False): import warnings warnings.warn( 'Possible nested set at position %d' % source.tell(), - FutureWarning, stacklevel=nested + 5 + FutureWarning, skip_file_prefixes=_warn_skips ) negate = sourcematch("^") # check remaining characters @@ -580,7 +583,7 @@ def _parse(source, state, verbose, nested, first=False): 'symmetric difference' if this == '~' else 'union', source.tell() - 1), - FutureWarning, stacklevel=nested + 5 + FutureWarning, skip_file_prefixes=_warn_skips ) code1 = LITERAL, _ord(this) if sourcematch("-"): @@ -603,7 +606,7 @@ def _parse(source, state, verbose, nested, first=False): warnings.warn( 'Possible set difference at position %d' % ( source.tell() - 2), - FutureWarning, stacklevel=nested + 5 + FutureWarning, skip_file_prefixes=_warn_skips ) code2 = LITERAL, _ord(that) if code1[0] != LITERAL or code2[0] != LITERAL: