Skip to content

Commit e7cb216

Browse files
barneygale, hugovk, AlexWaygood
authored
[3.12] GH-105113: Improve performance of pathlib.PurePath.match() (GH-105114)
We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com> Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
1 parent 076f3cd commit e7cb216

File tree

3 files changed

+81
-14
lines changed

3 files changed

+81
-14
lines changed

Doc/library/pathlib.rst

+7
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
569569
>>> PurePath('a/b.py').match('/*.py')
570570
False
571571

572+
The *pattern* may be another path object; this speeds up matching the same
573+
pattern against multiple files::
574+
575+
>>> pattern = PurePath('*.py')
576+
>>> PurePath('a/b.py').match(pattern)
577+
True
578+
572579
As with other methods, case-sensitivity follows platform defaults::
573580

574581
>>> PurePosixPath('b.py').match('*.PY')

Lib/pathlib.py

+72-14
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,30 @@ def _ignore_error(exception):
5454
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
5555

5656

57+
@functools.cache
5758
def _is_case_sensitive(flavour):
5859
return flavour.normcase('Aa') == 'Aa'
5960

6061
#
6162
# Globbing helpers
6263
#
6364

65+
66+
# fnmatch.translate() returns a regular expression that includes a prefix and
67+
# a suffix, which enable matching newlines and ensure the end of the string is
68+
# matched, respectively. These features are undesirable for our implementation
69+
# of PurePath.match(), which represents path separators as newlines and joins
70+
# pattern segments together. As a workaround, we define a slice object that
71+
# can remove the prefix and suffix from any translate() result. See the
72+
# _compile_pattern_lines() function for more details.
73+
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
74+
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
75+
_SWAP_SEP_AND_NEWLINE = {
76+
'/': str.maketrans({'/': '\n', '\n': '/'}),
77+
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
78+
}
79+
80+
6481
@functools.lru_cache()
6582
def _make_selector(pattern_parts, flavour, case_sensitive):
6683
pat = pattern_parts[0]
@@ -92,6 +109,38 @@ def _compile_pattern(pat, case_sensitive):
92109
return re.compile(fnmatch.translate(pat), flags).match
93110

94111

112+
@functools.lru_cache()
113+
def _compile_pattern_lines(pattern_lines, case_sensitive):
114+
"""Compile the given pattern lines to an `re.Pattern` object.
115+
116+
The *pattern_lines* argument is a glob-style pattern (e.g. '*/*.py') with
117+
its path separators and newlines swapped (e.g. '*\n*.py'). By using
118+
newlines to separate path components, and not setting `re.DOTALL`, we
119+
ensure that the `*` wildcard cannot match path separators.
120+
121+
The returned `re.Pattern` object may have its `match()` method called to
122+
match a complete pattern, or `search()` to match from the right. The
123+
argument supplied to these methods must also have its path separators and
124+
newlines swapped.
125+
"""
126+
127+
# Match the start of the path, or just after a path separator
128+
parts = ['^']
129+
for part in pattern_lines.splitlines(keepends=True):
130+
# We slice off the common prefix and suffix added by translate() to
131+
# ensure that re.DOTALL is not set and the end of the string is not
132+
# matched, respectively. With DOTALL not set, '*' wildcards will not
133+
# match path separators, because the '.' characters in the pattern
134+
# will not match newlines.
135+
parts.append(fnmatch.translate(part)[_FNMATCH_SLICE])
136+
# Match the end of the path, always.
137+
parts.append(r'\Z')
138+
flags = re.MULTILINE
139+
if not case_sensitive:
140+
flags |= re.IGNORECASE
141+
return re.compile(''.join(parts), flags=flags)
142+
143+
95144
class _Selector:
96145
"""A selector matches a specific glob pattern part against the children
97146
of a given path."""
@@ -274,6 +323,10 @@ class PurePath(object):
274323
# to implement comparison methods like `__lt__()`.
275324
'_parts_normcase_cached',
276325

326+
# The `_lines_cached` slot stores the string path with path separators
327+
# and newlines swapped. This is used to implement `match()`.
328+
'_lines_cached',
329+
277330
# The `_hash` slot stores the hash of the case-normalized string
278331
# path. It's set when `__hash__()` is called for the first time.
279332
'_hash',
@@ -439,6 +492,16 @@ def _parts_normcase(self):
439492
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
440493
return self._parts_normcase_cached
441494

495+
@property
496+
def _lines(self):
497+
# Path with separators and newlines swapped, for pattern matching.
498+
try:
499+
return self._lines_cached
500+
except AttributeError:
501+
trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
502+
self._lines_cached = str(self).translate(trans)
503+
return self._lines_cached
504+
442505
def __eq__(self, other):
443506
if not isinstance(other, PurePath):
444507
return NotImplemented
@@ -695,23 +758,18 @@ def match(self, path_pattern, *, case_sensitive=None):
695758
"""
696759
Return True if this path matches the given pattern.
697760
"""
761+
if not isinstance(path_pattern, PurePath):
762+
path_pattern = self.with_segments(path_pattern)
698763
if case_sensitive is None:
699764
case_sensitive = _is_case_sensitive(self._flavour)
700-
pat = self.with_segments(path_pattern)
701-
if not pat.parts:
765+
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
766+
if path_pattern.drive or path_pattern.root:
767+
return pattern.match(self._lines) is not None
768+
elif path_pattern._tail:
769+
return pattern.search(self._lines) is not None
770+
else:
702771
raise ValueError("empty pattern")
703-
pat_parts = pat.parts
704-
parts = self.parts
705-
if pat.drive or pat.root:
706-
if len(pat_parts) != len(parts):
707-
return False
708-
elif len(pat_parts) > len(parts):
709-
return False
710-
for part, pat in zip(reversed(parts), reversed(pat_parts)):
711-
match = _compile_pattern(pat, case_sensitive)
712-
if not match(part):
713-
return False
714-
return True
772+
715773

716774
# Can't subclass os.PathLike from PurePath and keep the constructor
717775
# optimizations in PurePath.__slots__.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Improve performance of :meth:`pathlib.PurePath.match` by compiling an
2+
:class:`re.Pattern` object for the entire pattern.

0 commit comments

Comments
 (0)