Skip to content

Commit 6258844

Browse files
authoredApr 10, 2024
pythonGH-117586: Speed up pathlib.Path.glob() by working with strings (python#117589)
Move pathlib globbing implementation into a new private class: `glob._Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.stat()` rather than `os.stat()`.

This sets the stage for two more improvements:

- pythonGH-115060: Query non-wildcard segments with `lstat()`
- pythonGH-116380: Unify `pathlib` and `glob` implementations of globbing.

No change to the implementations of `glob.glob()` and `glob.iglob()`.
1 parent 689ada7 commit 6258844

File tree

4 files changed

+269
-195
lines changed

4 files changed

+269
-195
lines changed
 

‎Lib/glob.py

+186
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import os
55
import re
66
import fnmatch
7+
import functools
78
import itertools
9+
import operator
810
import stat
911
import sys
1012

@@ -256,7 +258,9 @@ def escape(pathname):
256258
return drive + pathname
257259

258260

261+
_special_parts = ('', '.', '..')
259262
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
263+
_no_recurse_symlinks = object()
260264

261265

262266
def translate(pat, *, recursive=False, include_hidden=False, seps=None):
@@ -312,3 +316,185 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
312316
results.append(any_sep)
313317
res = ''.join(results)
314318
return fr'(?s:{res})\Z'
319+
320+
321+
@functools.lru_cache(maxsize=512)
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
    """Compile a glob pattern and return the compiled pattern's ``match``.

    *pat* is translated to a regular expression with ``translate()``
    (hidden files are always included, and *sep* is passed through as the
    path separators), then compiled case-insensitively unless
    *case_sensitive* is true.

    Note that the return value is the bound ``match`` method of the
    compiled ``re.Pattern`` object, not the pattern object itself.
    Results are memoised by ``functools.lru_cache``, so all arguments must
    be hashable.
    """
    flags = re.NOFLAG if case_sensitive else re.IGNORECASE
    regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
    return re.compile(regex, flags=flags).match
328+
329+
330+
class _Globber:
    """Class providing shell-style pattern matching and globbing.

    An instance is configured with a path separator, a case-sensitivity
    flag and a *recursive* setting.  ``selector()`` turns a stack of
    pattern parts into a generator function that walks the filesystem and
    yields matching paths.

    The low-level attributes (``lstat``, ``scandir``, ``parse_entry``,
    ``concat_path`` and ``add_slash``) are the only places that touch the
    filesystem or build paths; here they operate on plain strings.
    """

    def __init__(self, sep, case_sensitive, recursive=False):
        self.sep = sep
        self.case_sensitive = case_sensitive
        # Besides True/False, this may be the module-level
        # ``_no_recurse_symlinks`` sentinel; see recursive_selector().
        self.recursive = recursive

    # Low-level methods

    lstat = staticmethod(os.lstat)
    scandir = staticmethod(os.scandir)
    # Convert an entry yielded by scandir() into a path (its 'path' attribute).
    parse_entry = operator.attrgetter('path')
    # Append a text segment to a path; for strings this is concatenation.
    concat_path = operator.add

    if os.name == 'nt':
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing backslash, unless its tail
            (the part after drive and root) is empty or already ends with
            a slash or backslash."""
            tail = os.path.splitroot(pathname)[2]
            if not tail or tail[-1] in '\\/':
                return pathname
            return f'{pathname}\\'
    else:
        @staticmethod
        def add_slash(pathname):
            """Return *pathname* with a trailing slash, unless it is empty
            or already ends with one."""
            if not pathname or pathname[-1] == '/':
                return pathname
            return f'{pathname}/'

    # High-level methods

    def compile(self, pat):
        """Return a match function for *pat*, built by _compile_pattern()
        using this globber's separator, case sensitivity and recursive
        setting."""
        return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)

    def selector(self, parts):
        """Returns a function that selects from a given path, walking and
        filtering according to the glob-style pattern parts in *parts*.

        *parts* is treated as a stack: the next part to process is popped
        from the end, so the list is consumed in place.  Callers that need
        to keep their list should pass a copy.
        """
        if not parts:
            return self.select_exists
        part = parts.pop()
        if self.recursive and part == '**':
            selector = self.recursive_selector
        elif part in _special_parts:
            selector = self.special_selector
        else:
            selector = self.wildcard_selector
        return selector(part, parts)

    def special_selector(self, part, parts):
        """Returns a function that selects special children of the given path.

        *part* is appended verbatim to the path (after adding a trailing
        separator); no filesystem access happens here, so the *exists*
        flag is passed through unchanged to the next selector.
        """
        select_next = self.selector(parts)

        def select_special(path, exists=False):
            path = self.concat_path(self.add_slash(path), part)
            return select_next(path, exists)
        return select_special

    def wildcard_selector(self, part, parts):
        """Returns a function that selects direct children of a given path,
        filtering by pattern.

        When further parts remain, only directories are selected and each
        one is handed to the selector for the remaining parts.
        """

        # A bare '*' matches every name, so no regex is needed.
        match = None if part == '*' else self.compile(part)
        dir_only = bool(parts)
        if dir_only:
            select_next = self.selector(parts)

        def select_wildcard(path, exists=False):
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                # Unreadable or missing directory: it yields no matches.
                pass
            else:
                for entry in entries:
                    if match is None or match(entry.name):
                        if dir_only:
                            try:
                                if not entry.is_dir():
                                    continue
                            except OSError:
                                continue
                        entry_path = self.parse_entry(entry)
                        if dir_only:
                            # The entry came from scandir(), so it exists.
                            yield from select_next(entry_path, exists=True)
                        else:
                            yield entry_path
        return select_wildcard

    def recursive_selector(self, part, parts):
        """Returns a function that selects a given path and all its children,
        recursively, filtering by pattern.
        """
        # Optimization: consume following '**' parts, which have no effect.
        while parts and parts[-1] == '**':
            parts.pop()

        # Optimization: consume and join any following non-special parts here,
        # rather than leaving them for the next selector. They're used to
        # build a regular expression, which we use to filter the results of
        # the recursive walk. As a result, non-special pattern segments
        # following a '**' wildcard don't require additional filesystem access
        # to expand.
        follow_symlinks = self.recursive is not _no_recurse_symlinks
        if follow_symlinks:
            while parts and parts[-1] not in _special_parts:
                part += self.sep + parts.pop()

        # If nothing was joined on, every walked path matches.
        match = None if part == '**' else self.compile(part)
        dir_only = bool(parts)
        select_next = self.selector(parts)

        def select_recursive(path, exists=False):
            path = self.add_slash(path)
            # Matching starts after the starting path, so the regex only
            # sees the portion discovered by the walk.
            match_pos = len(str(path))
            if match is None or match(str(path), match_pos):
                yield from select_next(path, exists)
            # Explicit stack rather than recursion.
            stack = [path]
            while stack:
                yield from select_recursive_step(stack, match_pos)

        def select_recursive_step(stack, match_pos):
            path = stack.pop()
            try:
                # We must close the scandir() object before proceeding to
                # avoid exhausting file descriptors when globbing deep trees.
                with self.scandir(path) as scandir_it:
                    entries = list(scandir_it)
            except OSError:
                pass
            else:
                for entry in entries:
                    is_dir = False
                    try:
                        if entry.is_dir(follow_symlinks=follow_symlinks):
                            is_dir = True
                    except OSError:
                        # Treat entries whose type cannot be determined as
                        # non-directories.
                        pass

                    if is_dir or not dir_only:
                        entry_path = self.parse_entry(entry)
                        if match is None or match(str(entry_path), match_pos):
                            if dir_only:
                                yield from select_next(entry_path, exists=True)
                            else:
                                # Optimization: directly yield the path if this is
                                # last pattern part.
                                yield entry_path
                        if is_dir:
                            # Descend into this directory on a later step.
                            stack.append(entry_path)

        return select_recursive

    def select_exists(self, path, exists=False):
        """Yields the given path, if it exists.
        """
        if exists:
            # Optimization: this path is already known to exist, e.g. because
            # it was returned from os.scandir(), so we skip calling lstat().
            yield path
        else:
            try:
                self.lstat(path)
                yield path
            except OSError:
                pass

‎Lib/pathlib/__init__.py

+47-30
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
operating systems.
66
"""
77

8+
import glob
89
import io
910
import ntpath
11+
import operator
1012
import os
1113
import posixpath
1214
import sys
@@ -111,6 +113,7 @@ class PurePath(_abc.PurePathBase):
111113
'_hash',
112114
)
113115
parser = os.path
116+
_globber = glob._Globber
114117

115118
def __new__(cls, *args, **kwargs):
116119
"""Construct a PurePath from one or several strings and or existing
@@ -253,14 +256,17 @@ def _format_parsed_parts(cls, drv, root, tail):
253256
return cls.parser.sep.join(tail)
254257

255258
def _from_parsed_parts(self, drv, root, tail):
256-
path_str = self._format_parsed_parts(drv, root, tail)
257-
path = self.with_segments(path_str)
258-
path._str = path_str or '.'
259+
path = self._from_parsed_string(self._format_parsed_parts(drv, root, tail))
259260
path._drv = drv
260261
path._root = root
261262
path._tail_cached = tail
262263
return path
263264

265+
def _from_parsed_string(self, path_str):
    """Return a new path object built from *path_str*.

    The object is created with ``with_segments()`` and its ``_str``
    attribute is pre-populated with *path_str*, or with ``'.'`` when
    *path_str* is empty.
    """
    path = self.with_segments(path_str)
    path._str = path_str or '.'
    return path
269+
264270
@classmethod
265271
def _parse_path(cls, path):
266272
if not path:
@@ -453,21 +459,6 @@ def as_uri(self):
453459
from urllib.parse import quote_from_bytes
454460
return prefix + quote_from_bytes(os.fsencode(path))
455461

456-
@property
457-
def _pattern_stack(self):
458-
"""Stack of path components, to be used with patterns in glob()."""
459-
parts = self._tail.copy()
460-
pattern = self._raw_path
461-
if self.anchor:
462-
raise NotImplementedError("Non-relative patterns are unsupported")
463-
elif not parts:
464-
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
465-
elif pattern[-1] in (self.parser.sep, self.parser.altsep):
466-
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
467-
parts.append('')
468-
parts.reverse()
469-
return parts
470-
471462
@property
472463
def _pattern_str(self):
473464
"""The path expressed as a string, for use in pattern-matching."""
@@ -576,6 +567,17 @@ def write_text(self, data, encoding=None, errors=None, newline=None):
576567
encoding = io.text_encoding(encoding)
577568
return _abc.PathBase.write_text(self, data, encoding, errors, newline)
578569

570+
_remove_leading_dot = operator.itemgetter(slice(2, None))
571+
_remove_trailing_slash = operator.itemgetter(slice(-1))
572+
573+
def _filter_trailing_slash(self, paths):
    """Yield each string in *paths* with one trailing separator removed.

    A string is left unchanged if it does not end with ``self.parser.sep``
    or if it is no longer than ``self.anchor``, so a bare anchor keeps its
    separator.
    """
    sep = self.parser.sep
    anchor_len = len(self.anchor)
    for path_str in paths:
        if len(path_str) > anchor_len and path_str[-1] == sep:
            path_str = path_str[:-1]
        yield path_str
580+
579581
def iterdir(self):
580582
"""Yield path objects of the directory contents.
581583
@@ -587,13 +589,9 @@ def iterdir(self):
587589
def _scandir(self):
588590
return os.scandir(self)
589591

590-
def _direntry_str(self, entry):
591-
# Transform an entry yielded from _scandir() into a path string.
592-
return entry.name if str(self) == '.' else entry.path
593-
594592
def _make_child_direntry(self, entry):
595593
# Transform an entry yielded from _scandir() into a path object.
596-
path_str = self._direntry_str(entry)
594+
path_str = entry.name if str(self) == '.' else entry.path
597595
path = self.with_segments(path_str)
598596
path._str = path_str
599597
path._drv = self.drive
@@ -626,8 +624,30 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
626624
sys.audit("pathlib.Path.glob", self, pattern)
627625
if not isinstance(pattern, PurePath):
628626
pattern = self.with_segments(pattern)
629-
return _abc.PathBase.glob(
630-
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
627+
if pattern.anchor:
628+
raise NotImplementedError("Non-relative patterns are unsupported")
629+
parts = pattern._tail.copy()
630+
if not parts:
631+
raise ValueError("Unacceptable pattern: {!r}".format(pattern))
632+
raw = pattern._raw_path
633+
if raw[-1] in (self.parser.sep, self.parser.altsep):
634+
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
635+
parts.append('')
636+
if not self.is_dir():
637+
return iter([])
638+
select = self._glob_selector(parts[::-1], case_sensitive, recurse_symlinks)
639+
root = str(self)
640+
paths = select(root, exists=True)
641+
642+
# Normalize results
643+
if root == '.':
644+
paths = map(self._remove_leading_dot, paths)
645+
if parts[-1] == '':
646+
paths = map(self._remove_trailing_slash, paths)
647+
elif parts[-1] == '**':
648+
paths = self._filter_trailing_slash(paths)
649+
paths = map(self._from_parsed_string, paths)
650+
return paths
631651

632652
def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
633653
"""Recursively yield all existing files (of any kind, including
@@ -638,8 +658,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
638658
if not isinstance(pattern, PurePath):
639659
pattern = self.with_segments(pattern)
640660
pattern = '**' / pattern
641-
return _abc.PathBase.glob(
642-
self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
661+
return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
643662

644663
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
645664
"""Walk the directory tree from this directory, similar to os.walk()."""
@@ -669,9 +688,7 @@ def absolute(self):
669688
# of joining, and we exploit the fact that getcwd() returns a
670689
# fully-normalized string by storing it in _str. This is used to
671690
# implement Path.cwd().
672-
result = self.with_segments(cwd)
673-
result._str = cwd
674-
return result
691+
return self._from_parsed_string(cwd)
675692
drive, root, rel = os.path.splitroot(cwd)
676693
if not rel:
677694
return self._from_parsed_parts(drive, root, self._tail)

‎Lib/pathlib/_abc.py

+35-165
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
"""
1313

1414
import functools
15+
import glob
16+
import operator
1517
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
1618
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
1719

@@ -40,109 +42,23 @@ def _ignore_error(exception):
4042
def _is_case_sensitive(parser):
4143
return parser.normcase('Aa') == 'Aa'
4244

43-
#
44-
# Globbing helpers
45-
#
46-
47-
re = glob = None
48-
49-
50-
@functools.lru_cache(maxsize=512)
51-
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
52-
"""Compile given glob pattern to a re.Pattern object (observing case
53-
sensitivity)."""
54-
global re, glob
55-
if re is None:
56-
import re, glob
57-
58-
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
59-
regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep)
60-
return re.compile(regex, flags=flags).match
6145

46+
class Globber(glob._Globber):
47+
lstat = operator.methodcaller('lstat')
48+
scandir = operator.methodcaller('_scandir')
49+
add_slash = operator.methodcaller('joinpath', '')
6250

63-
def _select_special(paths, part):
64-
"""Yield special literal children of the given paths."""
65-
for path in paths:
66-
yield path._make_child_relpath(part)
67-
68-
69-
def _select_children(parent_paths, dir_only, match):
70-
"""Yield direct children of given paths, filtering by name and type."""
71-
for parent_path in parent_paths:
72-
try:
73-
# We must close the scandir() object before proceeding to
74-
# avoid exhausting file descriptors when globbing deep trees.
75-
with parent_path._scandir() as scandir_it:
76-
entries = list(scandir_it)
77-
except OSError:
78-
pass
79-
else:
80-
for entry in entries:
81-
if dir_only:
82-
try:
83-
if not entry.is_dir():
84-
continue
85-
except OSError:
86-
continue
87-
# Avoid cost of making a path object for non-matching paths by
88-
# matching against the os.DirEntry.name string.
89-
if match is None or match(entry.name):
90-
yield parent_path._make_child_direntry(entry)
91-
51+
@staticmethod
52+
def concat_path(path, text):
53+
"""Appends text to the given path.
54+
"""
55+
return path.with_segments(path._raw_path + text)
9256

93-
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
94-
"""Yield given paths and all their children, recursively, filtering by
95-
string and type.
96-
"""
97-
for parent_path in parent_paths:
98-
if match is not None:
99-
# If we're filtering paths through a regex, record the length of
100-
# the parent path. We'll pass it to match(path, pos=...) later.
101-
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
102-
paths = [parent_path._make_child_relpath('')]
103-
while paths:
104-
path = paths.pop()
105-
if match is None or match(str(path), parent_len):
106-
# Yield *directory* path that matches pattern (if any).
107-
yield path
108-
try:
109-
# We must close the scandir() object before proceeding to
110-
# avoid exhausting file descriptors when globbing deep trees.
111-
with path._scandir() as scandir_it:
112-
entries = list(scandir_it)
113-
except OSError:
114-
pass
115-
else:
116-
for entry in entries:
117-
# Handle directory entry.
118-
try:
119-
if entry.is_dir(follow_symlinks=follow_symlinks):
120-
# Recurse into this directory.
121-
paths.append(path._make_child_direntry(entry))
122-
continue
123-
except OSError:
124-
pass
125-
126-
# Handle file entry.
127-
if not dir_only:
128-
# Avoid cost of making a path object for non-matching
129-
# files by matching against the os.DirEntry object.
130-
if match is None or match(path._direntry_str(entry), parent_len):
131-
# Yield *file* path that matches pattern (if any).
132-
yield path._make_child_direntry(entry)
133-
134-
135-
def _select_unique(paths):
136-
"""Yields the given paths, filtering out duplicates."""
137-
yielded = set()
138-
try:
139-
for path in paths:
140-
path_str = str(path)
141-
if path_str not in yielded:
142-
yield path
143-
yielded.add(path_str)
144-
finally:
145-
yielded.clear()
57+
@staticmethod
58+
def parse_entry(entry):
59+
"""Returns the path of an entry yielded from scandir().
60+
"""
61+
return entry
14662

14763

14864
class UnsupportedOperation(NotImplementedError):
@@ -218,6 +134,7 @@ class PurePathBase:
218134
'_resolving',
219135
)
220136
parser = ParserBase()
137+
_globber = Globber
221138

222139
def __init__(self, path, *paths):
223140
self._raw_path = self.parser.join(path, *paths) if paths else path
@@ -454,14 +371,6 @@ def is_absolute(self):
454371
a drive)."""
455372
return self.parser.isabs(self._raw_path)
456373

457-
@property
458-
def _pattern_stack(self):
459-
"""Stack of path components, to be used with patterns in glob()."""
460-
anchor, parts = self._stack
461-
if anchor:
462-
raise NotImplementedError("Non-relative patterns are unsupported")
463-
return parts
464-
465374
@property
466375
def _pattern_str(self):
467376
"""The path expressed as a string, for use in pattern-matching."""
@@ -487,8 +396,9 @@ def match(self, path_pattern, *, case_sensitive=None):
487396
return False
488397
if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
489398
return False
399+
globber = self._globber(sep, case_sensitive)
490400
for path_part, pattern_part in zip(path_parts, pattern_parts):
491-
match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False)
401+
match = globber.compile(pattern_part)
492402
if match(path_part) is None:
493403
return False
494404
return True
@@ -502,7 +412,8 @@ def full_match(self, pattern, *, case_sensitive=None):
502412
pattern = self.with_segments(pattern)
503413
if case_sensitive is None:
504414
case_sensitive = _is_case_sensitive(self.parser)
505-
match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive)
415+
globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True)
416+
match = globber.compile(pattern._pattern_str)
506417
return match(self._pattern_str) is not None
507418

508419

@@ -772,11 +683,6 @@ def _scandir(self):
772683
from contextlib import nullcontext
773684
return nullcontext(self.iterdir())
774685

775-
def _direntry_str(self, entry):
776-
# Transform an entry yielded from _scandir() into a path string.
777-
# PathBase._scandir() yields PathBase objects, so use str().
778-
return str(entry)
779-
780686
def _make_child_direntry(self, entry):
781687
# Transform an entry yielded from _scandir() into a path object.
782688
# PathBase._scandir() yields PathBase objects, so this is a no-op.
@@ -785,62 +691,26 @@ def _make_child_direntry(self, entry):
785691
def _make_child_relpath(self, name):
786692
return self.joinpath(name)
787693

694+
def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
    """Return a selector function for the pattern parts in *parts*.

    When *case_sensitive* is None it is derived from this path's parser
    via ``_is_case_sensitive()``.  A globber is then created from
    ``self._globber`` and asked for a selector over *parts*.
    """
    if case_sensitive is None:
        case_sensitive = _is_case_sensitive(self.parser)
    # The globber's *recursive* argument doubles as the symlink policy:
    # True when symlinks should be followed, otherwise the
    # ``glob._no_recurse_symlinks`` sentinel.
    recursive = True if recurse_symlinks else glob._no_recurse_symlinks
    globber = self._globber(self.parser.sep, case_sensitive, recursive)
    return globber.selector(parts)
700+
788701
def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
789702
"""Iterate over this subtree and yield all existing files (of any
790703
kind, including directories) matching the given relative pattern.
791704
"""
792705
if not isinstance(pattern, PurePathBase):
793706
pattern = self.with_segments(pattern)
794-
if case_sensitive is None:
795-
# TODO: evaluate case-sensitivity of each directory in _select_children().
796-
case_sensitive = _is_case_sensitive(self.parser)
797-
798-
stack = pattern._pattern_stack
799-
specials = ('', '.', '..')
800-
deduplicate_paths = False
801-
sep = self.parser.sep
802-
paths = iter([self] if self.is_dir() else [])
803-
while stack:
804-
part = stack.pop()
805-
if part in specials:
806-
# Join special component (e.g. '..') onto paths.
807-
paths = _select_special(paths, part)
808-
809-
elif part == '**':
810-
# Consume following '**' components, which have no effect.
811-
while stack and stack[-1] == '**':
812-
stack.pop()
813-
814-
# Consume following non-special components, provided we're
815-
# treating symlinks consistently. Each component is joined
816-
# onto 'part', which is used to generate an re.Pattern object.
817-
if recurse_symlinks:
818-
while stack and stack[-1] not in specials:
819-
part += sep + stack.pop()
820-
821-
# If the previous loop consumed pattern components, compile an
822-
# re.Pattern object based on those components.
823-
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
824-
825-
# Recursively walk directories, filtering by type and regex.
826-
paths = _select_recursive(paths, bool(stack), recurse_symlinks, match)
827-
828-
# De-duplicate if we've already seen a '**' component.
829-
if deduplicate_paths:
830-
paths = _select_unique(paths)
831-
deduplicate_paths = True
832-
833-
elif '**' in part:
834-
raise ValueError("Invalid pattern: '**' can only be an entire path component")
835-
836-
else:
837-
# If the pattern component isn't '*', compile an re.Pattern
838-
# object based on the component.
839-
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
840-
841-
# Iterate over directories' children filtering by type and regex.
842-
paths = _select_children(paths, bool(stack), match)
843-
return paths
707+
anchor, parts = pattern._stack
708+
if anchor:
709+
raise NotImplementedError("Non-relative patterns are unsupported")
710+
if not self.is_dir():
711+
return iter([])
712+
select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
713+
return select(self, exists=True)
844714

845715
def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
846716
"""Recursively yield all existing files (of any kind, including
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Speed up :meth:`pathlib.Path.glob` by working with strings internally.

0 commit comments

Comments
 (0)
Please sign in to comment.