Skip to content

Commit 24af451

Browse files
authored
GH-102613: Fast recursive globbing in pathlib.Path.glob() (GH-104512)
This commit introduces a 'walk-and-match' strategy for handling glob patterns that include a non-terminal `**` wildcard, such as `**/*.py`. For this example, the previous implementation recursively walked directories using `os.scandir()` when it expanded the `**` component, and then **scanned those same directories again** when it expanded the `*.py` component. This is wasteful. In the new implementation, any components following a `**` wildcard are used to build a `re.Pattern` object, which is used to filter the results of the recursive walk. A pattern like `**/*.py` uses half the number of `os.scandir()` calls; a pattern like `**/*/*.py` a third, etc. This new algorithm does not apply if either: 1. The *follow_symlinks* argument is set to `None` (its default), or 2. The pattern contains `..` components. In these cases we fall back to the old implementation. This commit also replaces selector classes with selector functions. These generators directly yield results rather than calling through to their successors. A new internal `Path._glob()` method takes care to chain these generators together, which simplifies the lazy algorithm and slightly improves performance. It should also be easier to understand and maintain.
1 parent 2587b9f commit 24af451

File tree

4 files changed

+163
-140
lines changed

4 files changed

+163
-140
lines changed

Doc/library/pathlib.rst

+8-4
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,14 @@ call fails (for example because the path doesn't exist).
917917
PosixPath('setup.py'),
918918
PosixPath('test_pathlib.py')]
919919

920+
.. note::
921+
Using the "``**``" pattern in large directory trees may consume
922+
an inordinate amount of time.
923+
924+
.. tip::
925+
Set *follow_symlinks* to ``True`` or ``False`` to improve performance
926+
of recursive globbing.
927+
920928
By default, or when the *case_sensitive* keyword-only argument is set to
921929
``None``, this method matches paths using platform-specific casing rules:
922930
typically, case-sensitive on POSIX, and case-insensitive on Windows.
@@ -927,10 +935,6 @@ call fails (for example because the path doesn't exist).
927935
wildcards. Set *follow_symlinks* to ``True`` to always follow symlinks, or
928936
``False`` to treat all symlinks as files.
929937

930-
.. note::
931-
Using the "``**``" pattern in large directory trees may consume
932-
an inordinate amount of time.
933-
934938
.. audit-event:: pathlib.Path.glob self,pattern pathlib.Path.glob
935939

936940
.. versionchanged:: 3.11

Lib/pathlib.py

+133-136
Original file line numberDiff line numberDiff line change
@@ -78,33 +78,12 @@ def _is_case_sensitive(flavour):
7878
}
7979

8080

81-
@functools.lru_cache()
82-
def _make_selector(pattern_parts, flavour, case_sensitive):
83-
pat = pattern_parts[0]
84-
if not pat:
85-
return _TerminatingSelector()
86-
if pat == '**':
87-
child_parts_idx = 1
88-
while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
89-
child_parts_idx += 1
90-
child_parts = pattern_parts[child_parts_idx:]
91-
if '**' in child_parts:
92-
cls = _DoubleRecursiveWildcardSelector
93-
else:
94-
cls = _RecursiveWildcardSelector
95-
else:
96-
child_parts = pattern_parts[1:]
97-
if pat == '..':
98-
cls = _ParentSelector
99-
elif '**' in pat:
100-
raise ValueError("Invalid pattern: '**' can only be an entire path component")
101-
else:
102-
cls = _WildcardSelector
103-
return cls(pat, child_parts, flavour, case_sensitive)
104-
105-
10681
@functools.lru_cache(maxsize=256)
10782
def _compile_pattern(pat, case_sensitive):
83+
"""Compile given glob pattern to a re.Pattern object (observing case
84+
sensitivity), or None if the pattern should match everything."""
85+
if pat == '*':
86+
return None
10887
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
10988
return re.compile(fnmatch.translate(pat), flags).match
11089

@@ -127,7 +106,11 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
127106
# Match the start of the path, or just after a path separator
128107
parts = ['^']
129108
for part in pattern_lines.splitlines(keepends=True):
130-
if part == '**\n':
109+
if part == '*\n':
110+
part = r'.+\n'
111+
elif part == '*':
112+
part = r'.+'
113+
elif part == '**\n':
131114
# '**/' component: we use '[\s\S]' rather than '.' so that path
132115
# separators (i.e. newlines) are matched. The trailing '^' ensures
133116
# we terminate after a path separator (i.e. on a new line).
@@ -154,114 +137,70 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
154137
return re.compile(''.join(parts), flags=flags)
155138

156139

157-
class _Selector:
158-
"""A selector matches a specific glob pattern part against the children
159-
of a given path."""
160-
161-
def __init__(self, child_parts, flavour, case_sensitive):
162-
self.child_parts = child_parts
163-
if child_parts:
164-
self.successor = _make_selector(child_parts, flavour, case_sensitive)
165-
self.dironly = True
166-
else:
167-
self.successor = _TerminatingSelector()
168-
self.dironly = False
169-
170-
def select_from(self, parent_path, follow_symlinks):
171-
"""Iterate over all child paths of `parent_path` matched by this
172-
selector. This can contain parent_path itself."""
173-
path_cls = type(parent_path)
174-
scandir = path_cls._scandir
175-
if not parent_path.is_dir():
176-
return iter([])
177-
return self._select_from(parent_path, scandir, follow_symlinks)
178-
179-
180-
class _TerminatingSelector:
181-
182-
def _select_from(self, parent_path, scandir, follow_symlinks):
183-
yield parent_path
184-
185-
186-
class _ParentSelector(_Selector):
187-
188-
def __init__(self, name, child_parts, flavour, case_sensitive):
189-
_Selector.__init__(self, child_parts, flavour, case_sensitive)
190-
191-
def _select_from(self, parent_path, scandir, follow_symlinks):
192-
path = parent_path._make_child_relpath('..')
193-
for p in self.successor._select_from(path, scandir, follow_symlinks):
194-
yield p
195-
196-
197-
class _WildcardSelector(_Selector):
198-
199-
def __init__(self, pat, child_parts, flavour, case_sensitive):
200-
_Selector.__init__(self, child_parts, flavour, case_sensitive)
201-
if case_sensitive is None:
202-
# TODO: evaluate case-sensitivity of each directory in _select_from()
203-
case_sensitive = _is_case_sensitive(flavour)
204-
self.match = _compile_pattern(pat, case_sensitive)
205-
206-
def _select_from(self, parent_path, scandir, follow_symlinks):
207-
follow_dirlinks = True if follow_symlinks is None else follow_symlinks
140+
def _select_children(parent_paths, dir_only, follow_symlinks, match):
141+
"""Yield direct children of given paths, filtering by name and type."""
142+
if follow_symlinks is None:
143+
follow_symlinks = True
144+
for parent_path in parent_paths:
208145
try:
209146
# We must close the scandir() object before proceeding to
210147
# avoid exhausting file descriptors when globbing deep trees.
211-
with scandir(parent_path) as scandir_it:
148+
with parent_path._scandir() as scandir_it:
212149
entries = list(scandir_it)
213150
except OSError:
214151
pass
215152
else:
216153
for entry in entries:
217-
if self.dironly:
154+
if dir_only:
218155
try:
219-
if not entry.is_dir(follow_symlinks=follow_dirlinks):
156+
if not entry.is_dir(follow_symlinks=follow_symlinks):
220157
continue
221158
except OSError:
222159
continue
223160
name = entry.name
224-
if self.match(name):
225-
path = parent_path._make_child_relpath(name)
226-
for p in self.successor._select_from(path, scandir, follow_symlinks):
227-
yield p
228-
161+
if match is None or match(name):
162+
yield parent_path._make_child_relpath(name)
229163

230-
class _RecursiveWildcardSelector(_Selector):
231-
232-
def __init__(self, pat, child_parts, flavour, case_sensitive):
233-
_Selector.__init__(self, child_parts, flavour, case_sensitive)
234-
235-
def _iterate_directories(self, parent_path, follow_symlinks):
236-
yield parent_path
237-
for dirpath, dirnames, _ in parent_path.walk(follow_symlinks=follow_symlinks):
238-
for dirname in dirnames:
239-
yield dirpath._make_child_relpath(dirname)
240-
241-
def _select_from(self, parent_path, scandir, follow_symlinks):
242-
follow_dirlinks = False if follow_symlinks is None else follow_symlinks
243-
successor_select = self.successor._select_from
244-
for starting_point in self._iterate_directories(parent_path, follow_dirlinks):
245-
for p in successor_select(starting_point, scandir, follow_symlinks):
246-
yield p
247-
248-
249-
class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
250-
"""
251-
Like _RecursiveWildcardSelector, but also de-duplicates results from
252-
successive selectors. This is necessary if the pattern contains
253-
multiple non-adjacent '**' segments.
254-
"""
255164

256-
def _select_from(self, parent_path, scandir, follow_symlinks):
257-
yielded = set()
258-
try:
259-
for p in super()._select_from(parent_path, scandir, follow_symlinks):
260-
if p not in yielded:
261-
yield p
262-
yielded.add(p)
263-
finally:
264-
yielded.clear()
165+
def _select_recursive(parent_paths, dir_only, follow_symlinks):
166+
"""Yield given paths and all their subdirectories, recursively."""
167+
if follow_symlinks is None:
168+
follow_symlinks = False
169+
for parent_path in parent_paths:
170+
paths = [parent_path]
171+
while paths:
172+
path = paths.pop()
173+
yield path
174+
try:
175+
# We must close the scandir() object before proceeding to
176+
# avoid exhausting file descriptors when globbing deep trees.
177+
with path._scandir() as scandir_it:
178+
entries = list(scandir_it)
179+
except OSError:
180+
pass
181+
else:
182+
for entry in entries:
183+
try:
184+
if entry.is_dir(follow_symlinks=follow_symlinks):
185+
paths.append(path._make_child_relpath(entry.name))
186+
continue
187+
except OSError:
188+
pass
189+
if not dir_only:
190+
yield path._make_child_relpath(entry.name)
191+
192+
193+
def _select_unique(paths):
194+
"""Yields the given paths, filtering out duplicates."""
195+
yielded = set()
196+
try:
197+
for path in paths:
198+
raw_path = path._raw_path
199+
if raw_path not in yielded:
200+
yield path
201+
yielded.add(raw_path)
202+
finally:
203+
yielded.clear()
265204

266205

267206
#
@@ -1056,51 +995,109 @@ def _scandir(self):
1056995
return os.scandir(self)
1057996

1058997
def _make_child_relpath(self, name):
998+
sep = self._flavour.sep
999+
lines_name = name.replace('\n', sep)
1000+
lines_str = self._lines
10591001
path_str = str(self)
10601002
tail = self._tail
10611003
if tail:
1062-
path_str = f'{path_str}{self._flavour.sep}{name}'
1004+
path_str = f'{path_str}{sep}{name}'
1005+
lines_str = f'{lines_str}\n{lines_name}'
10631006
elif path_str != '.':
10641007
path_str = f'{path_str}{name}'
1008+
lines_str = f'{lines_str}{lines_name}'
10651009
else:
10661010
path_str = name
1011+
lines_str = lines_name
10671012
path = self.with_segments(path_str)
10681013
path._str = path_str
10691014
path._drv = self.drive
10701015
path._root = self.root
10711016
path._tail_cached = tail + [name]
1017+
path._lines_cached = lines_str
10721018
return path
10731019

10741020
def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
10751021
"""Iterate over this subtree and yield all existing files (of any
10761022
kind, including directories) matching the given relative pattern.
10771023
"""
10781024
sys.audit("pathlib.Path.glob", self, pattern)
1079-
if not pattern:
1080-
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
1081-
drv, root, pattern_parts = self._parse_path(pattern)
1082-
if drv or root:
1083-
raise NotImplementedError("Non-relative patterns are unsupported")
1084-
if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
1085-
pattern_parts.append('')
1086-
selector = _make_selector(tuple(pattern_parts), self._flavour, case_sensitive)
1087-
for p in selector.select_from(self, follow_symlinks):
1088-
yield p
1025+
return self._glob(pattern, case_sensitive, follow_symlinks)
10891026

10901027
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
10911028
"""Recursively yield all existing files (of any kind, including
10921029
directories) matching the given relative pattern, anywhere in
10931030
this subtree.
10941031
"""
10951032
sys.audit("pathlib.Path.rglob", self, pattern)
1096-
drv, root, pattern_parts = self._parse_path(pattern)
1097-
if drv or root:
1033+
return self._glob(f'**/{pattern}', case_sensitive, follow_symlinks)
1034+
1035+
def _glob(self, pattern, case_sensitive, follow_symlinks):
1036+
path_pattern = self.with_segments(pattern)
1037+
if path_pattern.drive or path_pattern.root:
10981038
raise NotImplementedError("Non-relative patterns are unsupported")
1099-
if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep):
1039+
elif not path_pattern._tail:
1040+
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
1041+
1042+
pattern_parts = list(path_pattern._tail)
1043+
if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
1044+
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
11001045
pattern_parts.append('')
1101-
selector = _make_selector(("**",) + tuple(pattern_parts), self._flavour, case_sensitive)
1102-
for p in selector.select_from(self, follow_symlinks):
1103-
yield p
1046+
if pattern_parts[-1] == '**':
1047+
# GH-70303: '**' only matches directories. Add trailing slash.
1048+
pattern_parts.append('')
1049+
1050+
if case_sensitive is None:
1051+
# TODO: evaluate case-sensitivity of each directory in _select_children().
1052+
case_sensitive = _is_case_sensitive(self._flavour)
1053+
1054+
# If symlinks are handled consistently, and the pattern does not
1055+
# contain '..' components, then we can use a 'walk-and-match' strategy
1056+
# when expanding '**' wildcards. When a '**' wildcard is encountered,
1057+
# all following pattern parts are immediately consumed and used to
1058+
# build a `re.Pattern` object. This pattern is used to filter the
1059+
# recursive walk. As a result, pattern parts following a '**' wildcard
1060+
# do not perform any filesystem access, which can be much faster!
1061+
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
1062+
deduplicate_paths = False
1063+
paths = iter([self] if self.is_dir() else [])
1064+
part_idx = 0
1065+
while part_idx < len(pattern_parts):
1066+
part = pattern_parts[part_idx]
1067+
part_idx += 1
1068+
if part == '':
1069+
# Trailing slash.
1070+
pass
1071+
elif part == '..':
1072+
paths = (path._make_child_relpath('..') for path in paths)
1073+
elif part == '**':
1074+
# Consume adjacent '**' components.
1075+
while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
1076+
part_idx += 1
1077+
1078+
if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
1079+
dir_only = pattern_parts[-1] == ''
1080+
paths = _select_recursive(paths, dir_only, follow_symlinks)
1081+
1082+
# Filter out paths that don't match pattern.
1083+
prefix_len = len(self._make_child_relpath('_')._lines) - 1
1084+
match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
1085+
paths = (path for path in paths if match(path._lines[prefix_len:]))
1086+
return paths
1087+
1088+
dir_only = part_idx < len(pattern_parts)
1089+
paths = _select_recursive(paths, dir_only, follow_symlinks)
1090+
if deduplicate_paths:
1091+
# De-duplicate if we've already seen a '**' component.
1092+
paths = _select_unique(paths)
1093+
deduplicate_paths = True
1094+
elif '**' in part:
1095+
raise ValueError("Invalid pattern: '**' can only be an entire path component")
1096+
else:
1097+
dir_only = part_idx < len(pattern_parts)
1098+
match = _compile_pattern(part, case_sensitive)
1099+
paths = _select_children(paths, dir_only, follow_symlinks, match)
1100+
return paths
11041101

11051102
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
11061103
"""Walk the directory tree from this directory, similar to os.walk()."""

Lib/test/test_pathlib.py

+18
Original file line numberDiff line numberDiff line change
@@ -1898,6 +1898,16 @@ def _check(path, glob, expected):
18981898
_check(p, "*B/*", ["dirB/fileB", "dirB/linkD", "linkB/fileB", "linkB/linkD"])
18991899
_check(p, "*/fileB", ["dirB/fileB", "linkB/fileB"])
19001900
_check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"])
1901+
_check(p, "dir*/*/..", ["dirC/dirD/..", "dirA/linkC/.."])
1902+
_check(p, "dir*/**/", ["dirA", "dirA/linkC", "dirA/linkC/linkD", "dirB", "dirB/linkD",
1903+
"dirC", "dirC/dirD", "dirE"])
1904+
_check(p, "dir*/**/..", ["dirA/..", "dirA/linkC/..", "dirB/..",
1905+
"dirC/..", "dirC/dirD/..", "dirE/.."])
1906+
_check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"])
1907+
_check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."])
1908+
_check(p, "dir*/**/fileC", ["dirC/fileC"])
1909+
_check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"])
1910+
_check(p, "*/dirD/**/", ["dirC/dirD"])
19011911

19021912
@os_helper.skip_unless_symlink
19031913
def test_glob_no_follow_symlinks_common(self):
@@ -1912,6 +1922,14 @@ def _check(path, glob, expected):
19121922
_check(p, "*B/*", ["dirB/fileB", "dirB/linkD"])
19131923
_check(p, "*/fileB", ["dirB/fileB"])
19141924
_check(p, "*/", ["dirA", "dirB", "dirC", "dirE"])
1925+
_check(p, "dir*/*/..", ["dirC/dirD/.."])
1926+
_check(p, "dir*/**/", ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"])
1927+
_check(p, "dir*/**/..", ["dirA/..", "dirB/..", "dirC/..", "dirC/dirD/..", "dirE/.."])
1928+
_check(p, "dir*/*/**/", ["dirC/dirD"])
1929+
_check(p, "dir*/*/**/..", ["dirC/dirD/.."])
1930+
_check(p, "dir*/**/fileC", ["dirC/fileC"])
1931+
_check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"])
1932+
_check(p, "*/dirD/**/", ["dirC/dirD"])
19151933

19161934
def test_rglob_common(self):
19171935
def _check(glob, expected):

0 commit comments

Comments
 (0)