Skip to content

Commit 49f90ba

Browse files
barneygale, hugovk, AlexWaygood
authored
GH-73435: Implement recursive wildcards in pathlib.PurePath.match() (python#101398)
`PurePath.match()` now handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments. We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com> Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
1 parent 4c77061 commit 49f90ba

File tree

5 files changed

+123
-15
lines changed

5 files changed

+123
-15
lines changed

Doc/library/pathlib.rst

+11
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
569569
>>> PurePath('a/b.py').match('/*.py')
570570
False
571571

572+
The *pattern* may be another path object; this speeds up matching the same
573+
pattern against multiple files::
574+
575+
>>> pattern = PurePath('*.py')
576+
>>> PurePath('a/b.py').match(pattern)
577+
True
578+
572579
As with other methods, case-sensitivity follows platform defaults::
573580

574581
>>> PurePosixPath('b.py').match('*.PY')
@@ -581,6 +588,10 @@ Pure paths provide the following methods and properties:
581588
.. versionadded:: 3.12
582589
The *case_sensitive* argument.
583590

591+
.. versionchanged:: 3.13
592+
Support for the recursive wildcard "``**``" was added. In previous
593+
versions, it acted like the non-recursive wildcard "``*``".
594+
584595

585596
.. method:: PurePath.relative_to(other, walk_up=False)
586597

Doc/whatsnew/3.13.rst

+3
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ Improved Modules
9090
pathlib
9191
-------
9292

93+
* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
94+
(Contributed by Barney Gale in :gh:`73435`.)
95+
9396
* Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and
9497
:meth:`~pathlib.Path.rglob`.
9598
(Contributed by Barney Gale in :gh:`77609`.)

Lib/pathlib.py

+85-14
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,30 @@ def _ignore_error(exception):
5454
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
5555

5656

57+
@functools.cache
5758
def _is_case_sensitive(flavour):
5859
return flavour.normcase('Aa') == 'Aa'
5960

6061
#
6162
# Globbing helpers
6263
#
6364

65+
66+
# fnmatch.translate() returns a regular expression that includes a prefix and
67+
# a suffix, which enable matching newlines and ensure the end of the string is
68+
# matched, respectively. These features are undesirable for our implementation
69+
# of PurePath.match(), which represents path separators as newlines and joins
70+
# pattern segments together. As a workaround, we define a slice object that
71+
# can remove the prefix and suffix from any translate() result. See the
72+
# _compile_pattern_lines() function for more details.
73+
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
74+
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
75+
_SWAP_SEP_AND_NEWLINE = {
76+
'/': str.maketrans({'/': '\n', '\n': '/'}),
77+
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
78+
}
79+
80+
6481
@functools.lru_cache()
6582
def _make_selector(pattern_parts, flavour, case_sensitive):
6683
pat = pattern_parts[0]
@@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive):
92109
return re.compile(fnmatch.translate(pat), flags).match
93110

94111

112+
@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
    """Compile the given pattern lines to an `re.Pattern` object.

    The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
    its path separators and newlines swapped (e.g. '**\\n*.py'). By using
    newlines to separate path components, and not setting `re.DOTALL`, we
    ensure that the `*` wildcard cannot match path separators.

    If *case_sensitive* is false, the pattern is compiled with
    `re.IGNORECASE`.

    The returned `re.Pattern` object may have its `match()` method called to
    match a complete pattern, or `search()` to match from the right. The
    argument supplied to these methods must also have its path separators and
    newlines swapped.

    Raises ValueError if '**' appears in a path component without being the
    entire component.
    """

    # Split into path components on '\n' only, keeping the '\n' on every
    # component that was followed by one. str.splitlines() is not suitable:
    # it also breaks on '\r', '\x0b', '\x0c', '\x1c'-'\x1e', '\x85', '\u2028'
    # and '\u2029', none of which stand for a path separator here. Splitting
    # on those would cut one component into several, letting e.g. 'a\r**'
    # slip past the '**' check below and breaking up '[...]' expressions.
    components = pattern_lines.split('\n')
    final = components.pop()
    components = [component + '\n' for component in components]
    if final:
        components.append(final)

    # Match the start of the path, or just after a path separator
    parts = ['^']
    for part in components:
        if part == '**\n':
            # '**/' component: we use '[\s\S]' rather than '.' so that path
            # separators (i.e. newlines) are matched. The trailing '^' ensures
            # we terminate after a path separator (i.e. on a new line).
            part = r'[\s\S]*^'
        elif part == '**':
            # '**' component.
            part = r'[\s\S]*'
        elif '**' in part:
            raise ValueError("Invalid pattern: '**' can only be an entire path component")
        else:
            # Any other component: pass to fnmatch.translate(). We slice off
            # the common prefix and suffix added by translate() to ensure that
            # re.DOTALL is not set, and the end of the string not matched,
            # respectively. With DOTALL not set, '*' wildcards will not match
            # path separators, because the '.' characters in the pattern will
            # not match newlines.
            part = fnmatch.translate(part)[_FNMATCH_SLICE]
        parts.append(part)
    # Match the end of the path, always.
    parts.append(r'\Z')
    flags = re.MULTILINE
    if not case_sensitive:
        flags |= re.IGNORECASE
    return re.compile(''.join(parts), flags=flags)
155+
156+
95157
class _Selector:
96158
"""A selector matches a specific glob pattern part against the children
97159
of a given path."""
@@ -276,6 +338,10 @@ class PurePath:
276338
# to implement comparison methods like `__lt__()`.
277339
'_parts_normcase_cached',
278340

341+
# The `_lines_cached` slot stores the string path with path separators
342+
# and newlines swapped. This is used to implement `match()`.
343+
'_lines_cached',
344+
279345
# The `_hash` slot stores the hash of the case-normalized string
280346
# path. It's set when `__hash__()` is called for the first time.
281347
'_hash',
@@ -441,6 +507,16 @@ def _parts_normcase(self):
441507
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
442508
return self._parts_normcase_cached
443509

510+
@property
def _lines(self):
    # String form of this path with each path separator replaced by a
    # newline and each newline replaced by the separator; used for pattern
    # matching. Computed on first access, then kept in `_lines_cached`.
    try:
        swapped = self._lines_cached
    except AttributeError:
        table = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
        swapped = self._lines_cached = str(self).translate(table)
    return swapped
519+
444520
def __eq__(self, other):
445521
if not isinstance(other, PurePath):
446522
return NotImplemented
@@ -697,23 +773,18 @@ def match(self, path_pattern, *, case_sensitive=None):
697773
"""
698774
Return True if this path matches the given pattern.
699775
"""
776+
if not isinstance(path_pattern, PurePath):
777+
path_pattern = self.with_segments(path_pattern)
700778
if case_sensitive is None:
701779
case_sensitive = _is_case_sensitive(self._flavour)
702-
pat = self.with_segments(path_pattern)
703-
if not pat.parts:
780+
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
781+
if path_pattern.drive or path_pattern.root:
782+
return pattern.match(self._lines) is not None
783+
elif path_pattern._tail:
784+
return pattern.search(self._lines) is not None
785+
else:
704786
raise ValueError("empty pattern")
705-
pat_parts = pat.parts
706-
parts = self.parts
707-
if pat.drive or pat.root:
708-
if len(pat_parts) != len(parts):
709-
return False
710-
elif len(pat_parts) > len(parts):
711-
return False
712-
for part, pat in zip(reversed(parts), reversed(pat_parts)):
713-
match = _compile_pattern(pat, case_sensitive)
714-
if not match(part):
715-
return False
716-
return True
787+
717788

718789
# Subclassing os.PathLike makes isinstance() checks slower,
719790
# which in turn makes Path construction slower. Register instead!

Lib/test/test_pathlib.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -310,8 +310,30 @@ def test_match_common(self):
310310
self.assertFalse(P('/ab.py').match('/a/*.py'))
311311
self.assertFalse(P('/a/b/c.py').match('/a/*.py'))
312312
# Multi-part glob-style pattern.
313-
self.assertFalse(P('/a/b/c.py').match('/**/*.py'))
313+
self.assertTrue(P('a').match('**'))
314+
self.assertTrue(P('c.py').match('**'))
315+
self.assertTrue(P('a/b/c.py').match('**'))
316+
self.assertTrue(P('/a/b/c.py').match('**'))
317+
self.assertTrue(P('/a/b/c.py').match('/**'))
318+
self.assertTrue(P('/a/b/c.py').match('**/'))
319+
self.assertTrue(P('/a/b/c.py').match('/a/**'))
320+
self.assertTrue(P('/a/b/c.py').match('**/*.py'))
321+
self.assertTrue(P('/a/b/c.py').match('/**/*.py'))
314322
self.assertTrue(P('/a/b/c.py').match('/a/**/*.py'))
323+
self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py'))
324+
self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py'))
325+
self.assertFalse(P('c.py').match('**/a.py'))
326+
self.assertFalse(P('c.py').match('c/**'))
327+
self.assertFalse(P('a/b/c.py').match('**/a'))
328+
self.assertFalse(P('a/b/c.py').match('**/a/b'))
329+
self.assertFalse(P('a/b/c.py').match('**/a/b/c'))
330+
self.assertFalse(P('a/b/c.py').match('**/a/b/c.'))
331+
self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
332+
self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
333+
self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**'))
334+
self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py'))
335+
self.assertRaises(ValueError, P('a').match, '**a/b/c')
336+
self.assertRaises(ValueError, P('a').match, 'a/b/c**')
315337
# Case-sensitive flag
316338
self.assertFalse(P('A.py').match('a.PY', case_sensitive=True))
317339
self.assertTrue(P('A.py').match('a.PY', case_sensitive=False))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.

0 commit comments

Comments
 (0)