Skip to content

Commit 6f93b4d

Browse files
authored
GH-115060: Speed up pathlib.Path.glob() by removing redundant regex matching (#115061)
When expanding and filtering paths for a `**` wildcard segment, build an `re.Pattern` object from the subsequent pattern parts, rather than the entire pattern, and match against the `os.DirEntry` object prior to instantiating a path object. Also skip compiling a pattern when expanding a `*` wildcard segment.
1 parent 9d1a353 commit 6f93b4d

File tree

4 files changed

+76
-28
lines changed

4 files changed

+76
-28
lines changed

Lib/pathlib/__init__.py

+6 -2
Original file line number | Diff line number | Diff line change
@@ -587,9 +587,13 @@ def iterdir(self):
587587
def _scandir(self):
588588
return os.scandir(self)
589589

590-
def _make_child_entry(self, entry):
590+
def _direntry_str(self, entry):
591+
# Transform an entry yielded from _scandir() into a path string.
592+
return entry.name if str(self) == '.' else entry.path
593+
594+
def _make_child_direntry(self, entry):
591595
# Transform an entry yielded from _scandir() into a path object.
592-
path_str = entry.name if str(self) == '.' else entry.path
596+
path_str = self._direntry_str(entry)
593597
path = self.with_segments(path_str)
594598
path._str = path_str
595599
path._drv = self.drive

Lib/pathlib/_abc.py

+56 -26
Original file line number | Diff line number | Diff line change
@@ -86,19 +86,29 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
8686
continue
8787
except OSError:
8888
continue
89-
if match(entry.name):
90-
yield parent_path._make_child_entry(entry)
89+
# Avoid cost of making a path object for non-matching paths by
90+
# matching against the os.DirEntry.name string.
91+
if match is None or match(entry.name):
92+
yield parent_path._make_child_direntry(entry)
9193

9294

93-
def _select_recursive(parent_paths, dir_only, follow_symlinks):
94-
"""Yield given paths and all their subdirectories, recursively."""
95+
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
96+
"""Yield given paths and all their children, recursively, filtering by
97+
string and type.
98+
"""
9599
if follow_symlinks is None:
96100
follow_symlinks = False
97101
for parent_path in parent_paths:
102+
if match is not None:
103+
# If we're filtering paths through a regex, record the length of
104+
# the parent path. We'll pass it to match(path, pos=...) later.
105+
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
98106
paths = [parent_path._make_child_relpath('')]
99107
while paths:
100108
path = paths.pop()
101-
yield path
109+
if match is None or match(str(path), parent_len):
110+
# Yield *directory* path that matches pattern (if any).
111+
yield path
102112
try:
103113
# We must close the scandir() object before proceeding to
104114
# avoid exhausting file descriptors when globbing deep trees.
@@ -108,14 +118,22 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
108118
pass
109119
else:
110120
for entry in entries:
121+
# Handle directory entry.
111122
try:
112123
if entry.is_dir(follow_symlinks=follow_symlinks):
113-
paths.append(path._make_child_entry(entry))
124+
# Recurse into this directory.
125+
paths.append(path._make_child_direntry(entry))
114126
continue
115127
except OSError:
116128
pass
129+
130+
# Handle file entry.
117131
if not dir_only:
118-
yield path._make_child_entry(entry)
132+
# Avoid cost of making a path object for non-matching
133+
# files by matching against the os.DirEntry object.
134+
if match is None or match(path._direntry_str(entry), parent_len):
135+
# Yield *file* path that matches pattern (if any).
136+
yield path._make_child_direntry(entry)
119137

120138

121139
def _select_unique(paths):
@@ -750,8 +768,14 @@ def _scandir(self):
750768
from contextlib import nullcontext
751769
return nullcontext(self.iterdir())
752770

753-
def _make_child_entry(self, entry):
771+
def _direntry_str(self, entry):
772+
# Transform an entry yielded from _scandir() into a path string.
773+
# PathBase._scandir() yields PathBase objects, so use str().
774+
return str(entry)
775+
776+
def _make_child_direntry(self, entry):
754777
# Transform an entry yielded from _scandir() into a path object.
778+
# PathBase._scandir() yields PathBase objects, so this is a no-op.
755779
return entry
756780

757781
def _make_child_relpath(self, name):
@@ -769,43 +793,49 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
769793

770794
stack = pattern._pattern_stack
771795
specials = ('', '.', '..')
772-
filter_paths = False
773796
deduplicate_paths = False
774797
sep = self.pathmod.sep
775798
paths = iter([self] if self.is_dir() else [])
776799
while stack:
777800
part = stack.pop()
778801
if part in specials:
802+
# Join special component (e.g. '..') onto paths.
779803
paths = _select_special(paths, part)
804+
780805
elif part == '**':
781-
# Consume adjacent '**' components.
806+
# Consume following '**' components, which have no effect.
782807
while stack and stack[-1] == '**':
783808
stack.pop()
784809

785-
# Consume adjacent non-special components and enable post-walk
786-
# regex filtering, provided we're treating symlinks consistently.
810+
# Consume following non-special components, provided we're
811+
# treating symlinks consistently. Each component is joined
812+
# onto 'part', which is used to generate an re.Pattern object.
787813
if follow_symlinks is not None:
788814
while stack and stack[-1] not in specials:
789-
filter_paths = True
790-
stack.pop()
815+
part += sep + stack.pop()
791816

792-
dir_only = bool(stack)
793-
paths = _select_recursive(paths, dir_only, follow_symlinks)
817+
# If the previous loop consumed pattern components, compile an
818+
# re.Pattern object based on those components.
819+
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
820+
821+
# Recursively walk directories, filtering by type and regex.
822+
paths = _select_recursive(paths, bool(stack), follow_symlinks, match)
823+
824+
# De-duplicate if we've already seen a '**' component.
794825
if deduplicate_paths:
795-
# De-duplicate if we've already seen a '**' component.
796826
paths = _select_unique(paths)
797827
deduplicate_paths = True
828+
798829
elif '**' in part:
799830
raise ValueError("Invalid pattern: '**' can only be an entire path component")
831+
800832
else:
801-
dir_only = bool(stack)
802-
match = _compile_pattern(part, sep, case_sensitive)
803-
paths = _select_children(paths, dir_only, follow_symlinks, match)
804-
if filter_paths:
805-
# Filter out paths that don't match pattern.
806-
prefix_len = len(str(self._make_child_relpath('_'))) - 1
807-
match = _compile_pattern(pattern._pattern_str, sep, case_sensitive)
808-
paths = (path for path in paths if match(path._pattern_str, prefix_len))
833+
# If the pattern component isn't '*', compile an re.Pattern
834+
# object based on the component.
835+
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
836+
837+
# Iterate over directories' children filtering by type and regex.
838+
paths = _select_children(paths, bool(stack), follow_symlinks, match)
809839
return paths
810840

811841
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -854,7 +884,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False):
854884

855885
if is_dir:
856886
if not top_down:
857-
paths.append(path._make_child_entry(entry))
887+
paths.append(path._make_child_direntry(entry))
858888
dirnames.append(entry.name)
859889
else:
860890
filenames.append(entry.name)

Lib/test/test_pathlib/test_pathlib.py

+13
Original file line number | Diff line number | Diff line change
@@ -1250,6 +1250,19 @@ def test_glob_pathlike(self):
12501250
self.assertEqual(expect, set(p.glob(P(pattern))))
12511251
self.assertEqual(expect, set(p.glob(FakePath(pattern))))
12521252

1253+
@needs_symlinks
1254+
def test_glob_dot(self):
1255+
P = self.cls
1256+
with os_helper.change_cwd(P(self.base, "dirC")):
1257+
self.assertEqual(
1258+
set(P('.').glob('*')), {P("fileC"), P("novel.txt"), P("dirD")})
1259+
self.assertEqual(
1260+
set(P('.').glob('**')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD"), P(".")})
1261+
self.assertEqual(
1262+
set(P('.').glob('**/*')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD")})
1263+
self.assertEqual(
1264+
set(P('.').glob('**/*/*')), {P("dirD/fileD")})
1265+
12531266
def test_rglob_pathlike(self):
12541267
P = self.cls
12551268
p = P(self.base, "dirC")
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching.

0 commit comments

Comments
 (0)