Skip to content

Commit fe2c46c

Browse files
committed
pythonGH-117586: Speed up pathlib.Path.glob() by working with strings
Move pathlib globbing implementation to a new module and class: `pathlib._glob.Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.stat()` rather than `os.stat()`.

This sets the stage for two more improvements:

- pythonGH-115060: Query non-wildcard segments with `lstat()`
- pythonGH-116380: Move `pathlib._glob` to `glob` (unify implementations).
1 parent 6150bb2 commit fe2c46c

File tree

4 files changed

+242
-186
lines changed

4 files changed

+242
-186
lines changed

Lib/pathlib/__init__.py

+16-25
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
except ImportError:
2424
grp = None
2525

26-
from . import _abc
26+
from . import _abc, _glob
2727

2828

2929
__all__ = [
@@ -111,6 +111,7 @@ class PurePath(_abc.PurePathBase):
111111
'_hash',
112112
)
113113
parser = os.path
114+
_globber = _glob.Globber
114115

115116
def __new__(cls, *args, **kwargs):
116117
"""Construct a PurePath from one or several strings and or existing
@@ -453,21 +454,6 @@ def as_uri(self):
453454
from urllib.parse import quote_from_bytes
454455
return prefix + quote_from_bytes(os.fsencode(path))
455456

456-
@property
457-
def _pattern_stack(self):
458-
"""Stack of path components, to be used with patterns in glob()."""
459-
parts = self._tail.copy()
460-
pattern = self._raw_path
461-
if self.anchor:
462-
raise NotImplementedError("Non-relative patterns are unsupported")
463-
elif not parts:
464-
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
465-
elif pattern[-1] in (self.parser.sep, self.parser.altsep):
466-
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
467-
parts.append('')
468-
parts.reverse()
469-
return parts
470-
471457
@property
472458
def _pattern_str(self):
473459
"""The path expressed as a string, for use in pattern-matching."""
@@ -587,13 +573,9 @@ def iterdir(self):
587573
def _scandir(self):
588574
return os.scandir(self)
589575

590-
def _direntry_str(self, entry):
591-
# Transform an entry yielded from _scandir() into a path string.
592-
return entry.name if str(self) == '.' else entry.path
593-
594576
def _make_child_direntry(self, entry):
595577
# Transform an entry yielded from _scandir() into a path object.
596-
path_str = self._direntry_str(entry)
578+
path_str = entry.name if str(self) == '.' else entry.path
597579
path = self.with_segments(path_str)
598580
path._str = path_str
599581
path._drv = self.drive
@@ -626,8 +608,18 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
626608
sys.audit("pathlib.Path.glob", self, pattern)
627609
if not isinstance(pattern, PurePath):
628610
pattern = self.with_segments(pattern)
629-
return _abc.PathBase.glob(
630-
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
611+
if pattern.anchor:
612+
raise NotImplementedError("Non-relative patterns are unsupported")
613+
parts = pattern._tail.copy()
614+
if not parts:
615+
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
616+
raw = pattern._raw_path
617+
if raw[-1] in (self.parser.sep, self.parser.altsep):
618+
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
619+
parts.append('')
620+
parts.reverse()
621+
select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
622+
return map(self.with_segments, select(str(self), exists=True))
631623

632624
def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
633625
"""Recursively yield all existing files (of any kind, including
@@ -638,8 +630,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
638630
if not isinstance(pattern, PurePath):
639631
pattern = self.with_segments(pattern)
640632
pattern = '**' / pattern
641-
return _abc.PathBase.glob(
642-
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
633+
return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
643634

644635
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
645636
"""Walk the directory tree from this directory, similar to os.walk()."""

Lib/pathlib/_abc.py

+34-161
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@
1212
"""
1313

1414
import functools
15+
import operator
1516
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
1617
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
1718

19+
from . import _glob
20+
1821
#
1922
# Internals
2023
#
@@ -44,105 +47,21 @@ def _is_case_sensitive(parser):
4447
# Globbing helpers
4548
#
4649

47-
re = glob = None
48-
49-
50-
@functools.lru_cache(maxsize=512)
51-
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
52-
"""Compile given glob pattern to a re.Pattern object (observing case
53-
sensitivity)."""
54-
global re, glob
55-
if re is None:
56-
import re, glob
57-
58-
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
59-
regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep)
60-
return re.compile(regex, flags=flags).match
61-
62-
63-
def _select_special(paths, part):
64-
"""Yield special literal children of the given paths."""
65-
for path in paths:
66-
yield path._make_child_relpath(part)
6750

51+
class Globber(_glob.Globber):
52+
lstat = operator.methodcaller('lstat')
53+
scandir = operator.methodcaller('_scandir')
54+
add_slash = operator.methodcaller('joinpath', '')
6855

69-
def _select_children(parent_paths, dir_only, match):
70-
"""Yield direct children of given paths, filtering by name and type."""
71-
for parent_path in parent_paths:
72-
try:
73-
# We must close the scandir() object before proceeding to
74-
# avoid exhausting file descriptors when globbing deep trees.
75-
with parent_path._scandir() as scandir_it:
76-
entries = list(scandir_it)
77-
except OSError:
78-
pass
79-
else:
80-
for entry in entries:
81-
if dir_only:
82-
try:
83-
if not entry.is_dir():
84-
continue
85-
except OSError:
86-
continue
87-
# Avoid cost of making a path object for non-matching paths by
88-
# matching against the os.DirEntry.name string.
89-
if match is None or match(entry.name):
90-
yield parent_path._make_child_direntry(entry)
91-
56+
def concat_path(self, path, text):
57+
"""Appends text to the given path.
58+
"""
59+
return path.with_segments(path._raw_path + text)
9260

93-
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
94-
"""Yield given paths and all their children, recursively, filtering by
95-
string and type.
96-
"""
97-
for parent_path in parent_paths:
98-
if match is not None:
99-
# If we're filtering paths through a regex, record the length of
100-
# the parent path. We'll pass it to match(path, pos=...) later.
101-
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
102-
paths = [parent_path._make_child_relpath('')]
103-
while paths:
104-
path = paths.pop()
105-
if match is None or match(str(path), parent_len):
106-
# Yield *directory* path that matches pattern (if any).
107-
yield path
108-
try:
109-
# We must close the scandir() object before proceeding to
110-
# avoid exhausting file descriptors when globbing deep trees.
111-
with path._scandir() as scandir_it:
112-
entries = list(scandir_it)
113-
except OSError:
114-
pass
115-
else:
116-
for entry in entries:
117-
# Handle directory entry.
118-
try:
119-
if entry.is_dir(follow_symlinks=follow_symlinks):
120-
# Recurse into this directory.
121-
paths.append(path._make_child_direntry(entry))
122-
continue
123-
except OSError:
124-
pass
125-
126-
# Handle file entry.
127-
if not dir_only:
128-
# Avoid cost of making a path object for non-matching
129-
# files by matching against the os.DirEntry object.
130-
if match is None or match(path._direntry_str(entry), parent_len):
131-
# Yield *file* path that matches pattern (if any).
132-
yield path._make_child_direntry(entry)
133-
134-
135-
def _select_unique(paths):
136-
"""Yields the given paths, filtering out duplicates."""
137-
yielded = set()
138-
try:
139-
for path in paths:
140-
path_str = str(path)
141-
if path_str not in yielded:
142-
yield path
143-
yielded.add(path_str)
144-
finally:
145-
yielded.clear()
61+
def parse_entry(self, entry):
62+
"""Returns the path of an entry yielded from scandir().
63+
"""
64+
return entry
14665

14766

14867
class UnsupportedOperation(NotImplementedError):
@@ -218,6 +137,7 @@ class PurePathBase:
218137
'_resolving',
219138
)
220139
parser = ParserBase()
140+
_globber = Globber
221141

222142
def __init__(self, path, *paths):
223143
self._raw_path = self.parser.join(path, *paths) if paths else path
@@ -454,14 +374,6 @@ def is_absolute(self):
454374
a drive)."""
455375
return self.parser.isabs(self._raw_path)
456376

457-
@property
458-
def _pattern_stack(self):
459-
"""Stack of path components, to be used with patterns in glob()."""
460-
anchor, parts = self._stack
461-
if anchor:
462-
raise NotImplementedError("Non-relative patterns are unsupported")
463-
return parts
464-
465377
@property
466378
def _pattern_str(self):
467379
"""The path expressed as a string, for use in pattern-matching."""
@@ -487,8 +399,9 @@ def match(self, path_pattern, *, case_sensitive=None):
487399
return False
488400
if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
489401
return False
402+
globber = self._globber(sep, case_sensitive)
490403
for path_part, pattern_part in zip(path_parts, pattern_parts):
491-
match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False)
404+
match = globber.compile(pattern_part)
492405
if match(path_part) is None:
493406
return False
494407
return True
@@ -502,7 +415,8 @@ def full_match(self, pattern, *, case_sensitive=None):
502415
pattern = self.with_segments(pattern)
503416
if case_sensitive is None:
504417
case_sensitive = _is_case_sensitive(self.parser)
505-
match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive)
418+
globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True)
419+
match = globber.compile(pattern._pattern_str)
506420
return match(self._pattern_str) is not None
507421

508422

@@ -772,11 +686,6 @@ def _scandir(self):
772686
from contextlib import nullcontext
773687
return nullcontext(self.iterdir())
774688

775-
def _direntry_str(self, entry):
776-
# Transform an entry yielded from _scandir() into a path string.
777-
# PathBase._scandir() yields PathBase objects, so use str().
778-
return str(entry)
779-
780689
def _make_child_direntry(self, entry):
781690
# Transform an entry yielded from _scandir() into a path object.
782691
# PathBase._scandir() yields PathBase objects, so this is a no-op.
@@ -785,62 +694,26 @@ def _make_child_direntry(self, entry):
785694
def _make_child_relpath(self, name):
786695
return self.joinpath(name)
787696

697+
def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
698+
if not self.is_dir():
699+
return iter([])
700+
if case_sensitive is None:
701+
case_sensitive = _is_case_sensitive(self.parser)
702+
recursive = True if recurse_symlinks else _glob.no_recurse_symlinks
703+
globber = self._globber(self.parser.sep, case_sensitive, recursive)
704+
return globber.selector(parts)
705+
788706
def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
789707
"""Iterate over this subtree and yield all existing files (of any
790708
kind, including directories) matching the given relative pattern.
791709
"""
792710
if not isinstance(pattern, PurePathBase):
793711
pattern = self.with_segments(pattern)
794-
if case_sensitive is None:
795-
# TODO: evaluate case-sensitivity of each directory in _select_children().
796-
case_sensitive = _is_case_sensitive(self.parser)
797-
798-
stack = pattern._pattern_stack
799-
specials = ('', '.', '..')
800-
deduplicate_paths = False
801-
sep = self.parser.sep
802-
paths = iter([self] if self.is_dir() else [])
803-
while stack:
804-
part = stack.pop()
805-
if part in specials:
806-
# Join special component (e.g. '..') onto paths.
807-
paths = _select_special(paths, part)
808-
809-
elif part == '**':
810-
# Consume following '**' components, which have no effect.
811-
while stack and stack[-1] == '**':
812-
stack.pop()
813-
814-
# Consume following non-special components, provided we're
815-
# treating symlinks consistently. Each component is joined
816-
# onto 'part', which is used to generate an re.Pattern object.
817-
if recurse_symlinks:
818-
while stack and stack[-1] not in specials:
819-
part += sep + stack.pop()
820-
821-
# If the previous loop consumed pattern components, compile an
822-
# re.Pattern object based on those components.
823-
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
824-
825-
# Recursively walk directories, filtering by type and regex.
826-
paths = _select_recursive(paths, bool(stack), recurse_symlinks, match)
827-
828-
# De-duplicate if we've already seen a '**' component.
829-
if deduplicate_paths:
830-
paths = _select_unique(paths)
831-
deduplicate_paths = True
832-
833-
elif '**' in part:
834-
raise ValueError("Invalid pattern: '**' can only be an entire path component")
835-
836-
else:
837-
# If the pattern component isn't '*', compile an re.Pattern
838-
# object based on the component.
839-
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
840-
841-
# Iterate over directories' children filtering by type and regex.
842-
paths = _select_children(paths, bool(stack), match)
843-
return paths
712+
anchor, parts = pattern._stack
713+
if anchor:
714+
raise NotImplementedError("Non-relative patterns are unsupported")
715+
select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
716+
return select(self, exists=True)
844717

845718
def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
846719
"""Recursively yield all existing files (of any kind, including

0 commit comments

Comments
 (0)