From fe2c46c9f07bbfaf8391cb9fe4179bc6e25665b2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 19:19:02 +0100 Subject: [PATCH 1/8] GH-117586: Speed up `pathlib.Path.glob()` by working with strings Move pathlib globbing implementation to a new module and class: `pathlib._glob.Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects. In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.stat()` rather than `os.stat()`. This sets the stage for two more improvements: - GH-115060: Query non-wildcard segments with `lstat()` - GH-116380: Move `pathlib._glob` to `glob` (unify implementations). --- Lib/pathlib/__init__.py | 41 ++-- Lib/pathlib/_abc.py | 195 +++--------------- Lib/pathlib/_glob.py | 191 +++++++++++++++++ ...-04-06-20-31-09.gh-issue-117586.UgWdRK.rst | 1 + 4 files changed, 242 insertions(+), 186 deletions(-) create mode 100644 Lib/pathlib/_glob.py create mode 100644 Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 747000f1a43475..c3b898a2131d5a 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -23,7 +23,7 @@ except ImportError: grp = None -from . import _abc +from . import _abc, _glob __all__ = [ @@ -111,6 +111,7 @@ class PurePath(_abc.PurePathBase): '_hash', ) parser = os.path + _globber = _glob.Globber def __new__(cls, *args, **kwargs): """Construct a PurePath from one or several strings and or existing @@ -453,21 +454,6 @@ def as_uri(self): from urllib.parse import quote_from_bytes return prefix + quote_from_bytes(os.fsencode(path)) - @property - def _pattern_stack(self): - """Stack of path components, to be used with patterns in glob().""" - parts = self._tail.copy() - pattern = self._raw_path - if self.anchor: - raise NotImplementedError("Non-relative patterns are unsupported") - elif not parts: - raise ValueError("Unacceptable pattern: {!r}".format(pattern)) - elif pattern[-1] in (self.parser.sep, self.parser.altsep): - # GH-65238: pathlib doesn't preserve trailing slash. Add it back. - parts.append('') - parts.reverse() - return parts - @property def _pattern_str(self): """The path expressed as a string, for use in pattern-matching.""" @@ -587,13 +573,9 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - def _direntry_str(self, entry): - # Transform an entry yielded from _scandir() into a path string. - return entry.name if str(self) == '.' else entry.path - def _make_child_direntry(self, entry): # Transform an entry yielded from _scandir() into a path object. - path_str = self._direntry_str(entry) + path_str = entry.name if str(self) == '.' else entry.path path = self.with_segments(path_str) path._str = path_str path._drv = self.drive @@ -626,8 +608,18 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): sys.audit("pathlib.Path.glob", self, pattern) if not isinstance(pattern, PurePath): pattern = self.with_segments(pattern) - return _abc.PathBase.glob( - self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) + if pattern.anchor: + raise NotImplementedError("Non-relative patterns are unsupported") + parts = pattern._tail.copy() + if not parts: + raise ValueError("Unacceptable pattern: {!r}".format(pattern)) + raw = pattern._raw_path + if raw[-1] in (self.parser.sep, self.parser.altsep): + # GH-65238: pathlib doesn't preserve trailing slash. Add it back. + parts.append('') + parts.reverse() + select = self._glob_selector(parts, case_sensitive, recurse_symlinks) + return map(self.with_segments, select(str(self), exists=True)) def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): """Recursively yield all existing files (of any kind, including @@ -638,8 +630,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): if not isinstance(pattern, PurePath): pattern = self.with_segments(pattern) pattern = '**' / pattern - return _abc.PathBase.glob( - self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) + return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks) def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index ca38a51d072cfb..ccfb525191e242 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -12,9 +12,12 @@ """ import functools +import operator from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO +from . import _glob + # # Internals # @@ -44,105 +47,21 @@ def _is_case_sensitive(parser): # Globbing helpers # -re = glob = None - - -@functools.lru_cache(maxsize=512) -def _compile_pattern(pat, sep, case_sensitive, recursive=True): - """Compile given glob pattern to a re.Pattern object (observing case - sensitivity).""" - global re, glob - if re is None: - import re, glob - - flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep) - return re.compile(regex, flags=flags).match - - -def _select_special(paths, part): - """Yield special literal children of the given paths.""" - for path in paths: - yield path._make_child_relpath(part) +class Globber(_glob.Globber): + lstat = operator.methodcaller('lstat') + scandir = operator.methodcaller('_scandir') + add_slash = operator.methodcaller('joinpath', '') -def _select_children(parent_paths, dir_only, match): - """Yield direct children of given paths, filtering by name and type.""" - for parent_path in parent_paths: - try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with parent_path._scandir() as scandir_it: - entries = list(scandir_it) - except OSError: - pass - else: - for entry in entries: - if dir_only: - try: - if not entry.is_dir(): - continue - except OSError: - continue - # Avoid cost of making a path object for non-matching paths by - # matching against the os.DirEntry.name string. - if match is None or match(entry.name): - yield parent_path._make_child_direntry(entry) - + def concat_path(self, path, text): + """Appends text to the given path. + """ + return path.with_segments(path._raw_path + text) -def _select_recursive(parent_paths, dir_only, follow_symlinks, match): - """Yield given paths and all their children, recursively, filtering by - string and type. - """ - for parent_path in parent_paths: - if match is not None: - # If we're filtering paths through a regex, record the length of - # the parent path. We'll pass it to match(path, pos=...) later. - parent_len = len(str(parent_path._make_child_relpath('_'))) - 1 - paths = [parent_path._make_child_relpath('')] - while paths: - path = paths.pop() - if match is None or match(str(path), parent_len): - # Yield *directory* path that matches pattern (if any). - yield path - try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with path._scandir() as scandir_it: - entries = list(scandir_it) - except OSError: - pass - else: - for entry in entries: - # Handle directory entry. - try: - if entry.is_dir(follow_symlinks=follow_symlinks): - # Recurse into this directory. - paths.append(path._make_child_direntry(entry)) - continue - except OSError: - pass - - # Handle file entry. - if not dir_only: - # Avoid cost of making a path object for non-matching - # files by matching against the os.DirEntry object. - if match is None or match(path._direntry_str(entry), parent_len): - # Yield *file* path that matches pattern (if any). - yield path._make_child_direntry(entry) - - -def _select_unique(paths): - """Yields the given paths, filtering out duplicates.""" - yielded = set() - try: - for path in paths: - path_str = str(path) - if path_str not in yielded: - yield path - yielded.add(path_str) - finally: - yielded.clear() + def parse_entry(self, entry): + """Returns the path of an entry yielded from scandir(). + """ + return entry class UnsupportedOperation(NotImplementedError): @@ -218,6 +137,7 @@ class PurePathBase: '_resolving', ) parser = ParserBase() + _globber = Globber def __init__(self, path, *paths): self._raw_path = self.parser.join(path, *paths) if paths else path @@ -454,14 +374,6 @@ def is_absolute(self): a drive).""" return self.parser.isabs(self._raw_path) - @property - def _pattern_stack(self): - """Stack of path components, to be used with patterns in glob().""" - anchor, parts = self._stack - if anchor: - raise NotImplementedError("Non-relative patterns are unsupported") - return parts - @property def _pattern_str(self): """The path expressed as a string, for use in pattern-matching.""" @@ -487,8 +399,9 @@ def match(self, path_pattern, *, case_sensitive=None): return False if len(path_parts) > len(pattern_parts) and path_pattern.anchor: return False + globber = self._globber(sep, case_sensitive) for path_part, pattern_part in zip(path_parts, pattern_parts): - match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False) + match = globber.compile(pattern_part) if match(path_part) is None: return False return True @@ -502,7 +415,8 @@ def full_match(self, pattern, *, case_sensitive=None): pattern = self.with_segments(pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) - match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive) + globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True) + match = globber.compile(pattern._pattern_str) return match(self._pattern_str) is not None @@ -772,11 +686,6 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - def _direntry_str(self, entry): - # Transform an entry yielded from _scandir() into a path string. - # PathBase._scandir() yields PathBase objects, so use str(). - return str(entry) - def _make_child_direntry(self, entry): # Transform an entry yielded from _scandir() into a path object. # PathBase._scandir() yields PathBase objects, so this is a no-op. @@ -785,62 +694,26 @@ def _make_child_direntry(self, entry): def _make_child_relpath(self, name): return self.joinpath(name) + def _glob_selector(self, parts, case_sensitive, recurse_symlinks): + if not self.is_dir(): + return iter([]) + if case_sensitive is None: + case_sensitive = _is_case_sensitive(self.parser) + recursive = True if recurse_symlinks else _glob.no_recurse_symlinks + globber = self._globber(self.parser.sep, case_sensitive, recursive) + return globber.selector(parts) + def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): """Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. """ if not isinstance(pattern, PurePathBase): pattern = self.with_segments(pattern) - if case_sensitive is None: - # TODO: evaluate case-sensitivity of each directory in _select_children(). - case_sensitive = _is_case_sensitive(self.parser) - - stack = pattern._pattern_stack - specials = ('', '.', '..') - deduplicate_paths = False - sep = self.parser.sep - paths = iter([self] if self.is_dir() else []) - while stack: - part = stack.pop() - if part in specials: - # Join special component (e.g. '..') onto paths. - paths = _select_special(paths, part) - - elif part == '**': - # Consume following '**' components, which have no effect. - while stack and stack[-1] == '**': - stack.pop() - - # Consume following non-special components, provided we're - # treating symlinks consistently. Each component is joined - # onto 'part', which is used to generate an re.Pattern object. - if recurse_symlinks: - while stack and stack[-1] not in specials: - part += sep + stack.pop() - - # If the previous loop consumed pattern components, compile an - # re.Pattern object based on those components. - match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None - - # Recursively walk directories, filtering by type and regex. - paths = _select_recursive(paths, bool(stack), recurse_symlinks, match) - - # De-duplicate if we've already seen a '**' component. - if deduplicate_paths: - paths = _select_unique(paths) - deduplicate_paths = True - - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - - else: - # If the pattern component isn't '*', compile an re.Pattern - # object based on the component. - match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None - - # Iterate over directories' children filtering by type and regex. - paths = _select_children(paths, bool(stack), match) - return paths + anchor, parts = pattern._stack + if anchor: + raise NotImplementedError("Non-relative patterns are unsupported") + select = self._glob_selector(parts, case_sensitive, recurse_symlinks) + return select(self, exists=True) def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): """Recursively yield all existing files (of any kind, including diff --git a/Lib/pathlib/_glob.py b/Lib/pathlib/_glob.py new file mode 100644 index 00000000000000..6f7abb15cee5a1 --- /dev/null +++ b/Lib/pathlib/_glob.py @@ -0,0 +1,191 @@ +import functools +import operator +import os + + +re = glob = None +special_parts = ('', '.', '..') +no_recurse_symlinks = object() + + +@functools.lru_cache(maxsize=512) +def compile_pattern(pat, sep, case_sensitive, recursive=True): + """Compile given glob pattern to a re.Pattern object (observing case + sensitivity).""" + global re, glob + if re is None: + import re, glob + + flags = re.NOFLAG if case_sensitive else re.IGNORECASE + regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep) + return re.compile(regex, flags=flags).match + + +class Globber: + """Class providing shell-style pattern matching and globbing. + """ + + def __init__(self, sep, case_sensitive, recursive=False): + self.sep = sep + self.case_sensitive = case_sensitive + self.recursive = recursive + + # Low-level methods + + lstat = staticmethod(os.lstat) + scandir = staticmethod(os.scandir) + parse_entry = operator.attrgetter('path') + concat_path = operator.add + + if os.name == 'nt': + def add_slash(self, pathname): + tail = os.path.splitroot(pathname)[2] + if not tail or tail[-1] in '\\/': + return pathname + return f'{pathname}\\' + else: + def add_slash(self, pathname): + if not pathname or pathname[-1] == '/': + return pathname + return f'{pathname}/' + + # High-level methods + + def compile(self, pat): + return compile_pattern(pat, self.sep, self.case_sensitive, self.recursive) + + def selector(self, parts): + """Returns a function that selects from a given path, walking and + filtering according to the glob-style pattern parts in *parts*. + """ + if not parts: + return self.select_exists + part = parts.pop() + if self.recursive and part == '**': + selector = self.recursive_selector + elif part in special_parts: + selector = self.special_selector + else: + selector = self.wildcard_selector + return selector(part, parts) + + def special_selector(self, part, parts): + """Returns a function that selects special children of the given path. + """ + select_next = self.selector(parts) + + def select_special(path, exists=False): + path = self.concat_path(self.add_slash(path), part) + return select_next(path, exists) + return select_special + + def wildcard_selector(self, part, parts): + """Returns a function that selects direct children of a given path, + filtering by pattern. + """ + + match = None if part == '*' else self.compile(part) + dir_only = bool(parts) + if dir_only: + select_next = self.selector(parts) + + def select_wildcard(path, exists=False): + try: + # We must close the scandir() object before proceeding to + # avoid exhausting file descriptors when globbing deep trees. + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + for entry in entries: + if match is None or match(entry.name): + if dir_only: + try: + if not entry.is_dir(): + continue + except OSError: + continue + entry_path = self.parse_entry(entry) + if dir_only: + yield from select_next(entry_path, exists=True) + else: + yield entry_path + except OSError: + pass + return select_wildcard + + def recursive_selector(self, part, parts): + """Returns a function that selects a given path and all its children, + recursively, filtering by pattern. + """ + # Optimization: consume following '**' parts, which have no effect. + while parts and parts[-1] == '**': + parts.pop() + + # Optimization: consume and join any following non-special parts here, + # rather than leaving them for the next selector. They're used to + # build a regular expression, which we use to filter the results of + # the recursive walk. As a result, non-special pattern segments + # following a '**' wildcard don't require additional filesystem access + # to expand. + follow_symlinks = self.recursive is not no_recurse_symlinks + if follow_symlinks: + while parts and parts[-1] not in special_parts: + part += self.sep + parts.pop() + + match = None if part == '**' else self.compile(part) + dir_only = bool(parts) + select_next = self.selector(parts) + + def select_recursive(path, exists=False): + path = self.add_slash(path) + match_pos = len(str(path)) + if match is None or match(str(path), match_pos): + yield from select_next(path, exists) + stack = [path] + while stack: + yield from select_recursive_step(stack, match_pos) + + def select_recursive_step(stack, match_pos): + path = stack.pop() + try: + # We must close the scandir() object before proceeding to + # avoid exhausting file descriptors when globbing deep trees. + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + except OSError: + pass + else: + for entry in entries: + is_dir = False + try: + if entry.is_dir(follow_symlinks=follow_symlinks): + is_dir = True + except OSError: + pass + + if is_dir or not dir_only: + entry_path = self.parse_entry(entry) + if match is None or match(str(entry_path), match_pos): + if dir_only: + yield from select_next(entry_path, exists=True) + else: + # Optimization: directly yield the path if this is + # last pattern part. + yield entry_path + if is_dir: + stack.append(entry_path) + + return select_recursive + + def select_exists(self, path, exists=False): + """Yields the given path, if it exists. + """ + if exists: + # Optimization: this path is already known to exist, e.g. because + # it was returned from os.scandir(), so we skip calling lstat(). + yield path + else: + try: + self.lstat(path) + yield path + except OSError: + pass diff --git a/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst b/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst new file mode 100644 index 00000000000000..65c699977bd807 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst @@ -0,0 +1 @@ +Speed up :meth:`pathlib.Path.glob` by working with strings internally. From 26d2c03f3a46e797ed1b8d4edd19e0837bbf1cd2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 20:57:48 +0100 Subject: [PATCH 2/8] Move class into `glob` module. --- Lib/glob.py | 185 ++++++++++++++++++++++++++++++++++++++ Lib/pathlib/__init__.py | 5 +- Lib/pathlib/_abc.py | 11 +-- Lib/pathlib/_glob.py | 191 ---------------------------------------- 4 files changed, 191 insertions(+), 201 deletions(-) delete mode 100644 Lib/pathlib/_glob.py diff --git a/Lib/glob.py b/Lib/glob.py index a915cf0bdf4502..e773c3acf32259 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -4,7 +4,9 @@ import os import re import fnmatch +import functools import itertools +import operator import stat import sys @@ -256,7 +258,9 @@ def escape(pathname): return drive + pathname +_special_parts = ('', '.', '..') _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) +_no_recurse_symlinks = object() def translate(pat, *, recursive=False, include_hidden=False, seps=None): @@ -312,3 +316,184 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): results.append(any_sep) res = ''.join(results) return fr'(?s:{res})\Z' + + +@functools.lru_cache(maxsize=512) +def _compile_pattern(pat, sep, case_sensitive, recursive=True): + """Compile given glob pattern to a re.Pattern object (observing case + sensitivity).""" + flags = re.NOFLAG if case_sensitive else re.IGNORECASE + regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep) + return re.compile(regex, flags=flags).match + + +if os.name == 'nt': + def _add_slash(pathname): + tail = os.path.splitroot(pathname)[2] + if not tail or tail[-1] in '\\/': + return pathname + return f'{pathname}\\' +else: + def _add_slash(pathname): + if not pathname or pathname[-1] == '/': + return pathname + return f'{pathname}/' + + +class _Globber: + """Class providing shell-style pattern matching and globbing. + """ + + def __init__(self, sep, case_sensitive, recursive=False): + self.sep = sep + self.case_sensitive = case_sensitive + self.recursive = recursive + + # Low-level methods + + lstat = staticmethod(os.lstat) + scandir = staticmethod(os.scandir) + add_slash = staticmethod(_add_slash) + concat_path = operator.add + parse_entry = operator.attrgetter('path') + + # High-level methods + + def compile(self, pat): + return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive) + + def selector(self, parts): + """Returns a function that selects from a given path, walking and + filtering according to the glob-style pattern parts in *parts*. + """ + if not parts: + return self.select_exists + part = parts.pop() + if self.recursive and part == '**': + selector = self.recursive_selector + elif part in _special_parts: + selector = self.special_selector + else: + selector = self.wildcard_selector + return selector(part, parts) + + def special_selector(self, part, parts): + """Returns a function that selects special children of the given path. + """ + select_next = self.selector(parts) + + def select_special(path, exists=False): + path = self.concat_path(self.add_slash(path), part) + return select_next(path, exists) + return select_special + + def wildcard_selector(self, part, parts): + """Returns a function that selects direct children of a given path, + filtering by pattern. + """ + + match = None if part == '*' else self.compile(part) + dir_only = bool(parts) + if dir_only: + select_next = self.selector(parts) + + def select_wildcard(path, exists=False): + try: + # We must close the scandir() object before proceeding to + # avoid exhausting file descriptors when globbing deep trees. + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + for entry in entries: + if match is None or match(entry.name): + if dir_only: + try: + if not entry.is_dir(): + continue + except OSError: + continue + entry_path = self.parse_entry(entry) + if dir_only: + yield from select_next(entry_path, exists=True) + else: + yield entry_path + except OSError: + pass + return select_wildcard + + def recursive_selector(self, part, parts): + """Returns a function that selects a given path and all its children, + recursively, filtering by pattern. + """ + # Optimization: consume following '**' parts, which have no effect. + while parts and parts[-1] == '**': + parts.pop() + + # Optimization: consume and join any following non-special parts here, + # rather than leaving them for the next selector. They're used to + # build a regular expression, which we use to filter the results of + # the recursive walk. As a result, non-special pattern segments + # following a '**' wildcard don't require additional filesystem access + # to expand. + follow_symlinks = self.recursive is not _no_recurse_symlinks + if follow_symlinks: + while parts and parts[-1] not in _special_parts: + part += self.sep + parts.pop() + + match = None if part == '**' else self.compile(part) + dir_only = bool(parts) + select_next = self.selector(parts) + + def select_recursive(path, exists=False): + path = self.add_slash(path) + match_pos = len(str(path)) + if match is None or match(str(path), match_pos): + yield from select_next(path, exists) + stack = [path] + while stack: + yield from select_recursive_step(stack, match_pos) + + def select_recursive_step(stack, match_pos): + path = stack.pop() + try: + # We must close the scandir() object before proceeding to + # avoid exhausting file descriptors when globbing deep trees. + with self.scandir(path) as scandir_it: + entries = list(scandir_it) + except OSError: + pass + else: + for entry in entries: + is_dir = False + try: + if entry.is_dir(follow_symlinks=follow_symlinks): + is_dir = True + except OSError: + pass + + if is_dir or not dir_only: + entry_path = self.parse_entry(entry) + if match is None or match(str(entry_path), match_pos): + if dir_only: + yield from select_next(entry_path, exists=True) + else: + # Optimization: directly yield the path if this is + # last pattern part. + yield entry_path + if is_dir: + stack.append(entry_path) + + return select_recursive + + def select_exists(self, path, exists=False): + """Yields the given path, if it exists. + """ + if exists: + # Optimization: this path is already known to exist, e.g. because + # it was returned from os.scandir(), so we skip calling lstat(). + yield path + else: + try: + self.lstat(path) + yield path + except OSError: + pass diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index c3b898a2131d5a..e77d5da78a1a05 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -5,6 +5,7 @@ operating systems. """ +import glob import io import ntpath import os @@ -23,7 +24,7 @@ except ImportError: grp = None -from . import _abc, _glob +from . import _abc __all__ = [ @@ -111,7 +112,7 @@ class PurePath(_abc.PurePathBase): '_hash', ) parser = os.path - _globber = _glob.Globber + _globber = glob._Globber def __new__(cls, *args, **kwargs): """Construct a PurePath from one or several strings and or existing diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index ccfb525191e242..de8f6de74c6642 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -12,12 +12,11 @@ """ import functools +import glob import operator from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO -from . import _glob - # # Internals # @@ -43,12 +42,8 @@ def _ignore_error(exception): def _is_case_sensitive(parser): return parser.normcase('Aa') == 'Aa' -# -# Globbing helpers -# - -class Globber(_glob.Globber): +class Globber(glob._Globber): lstat = operator.methodcaller('lstat') scandir = operator.methodcaller('_scandir') add_slash = operator.methodcaller('joinpath', '') @@ -699,7 +694,7 @@ def _glob_selector(self, parts, case_sensitive, recurse_symlinks): return iter([]) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) - recursive = True if recurse_symlinks else _glob.no_recurse_symlinks + recursive = True if recurse_symlinks else glob._no_recurse_symlinks globber = self._globber(self.parser.sep, case_sensitive, recursive) return globber.selector(parts) diff --git a/Lib/pathlib/_glob.py b/Lib/pathlib/_glob.py deleted file mode 100644 index 6f7abb15cee5a1..00000000000000 --- a/Lib/pathlib/_glob.py +++ /dev/null @@ -1,191 +0,0 @@ -import functools -import operator -import os - - -re = glob = None -special_parts = ('', '.', '..') -no_recurse_symlinks = object() - - -@functools.lru_cache(maxsize=512) -def compile_pattern(pat, sep, case_sensitive, recursive=True): - """Compile given glob pattern to a re.Pattern object (observing case - sensitivity).""" - global re, glob - if re is None: - import re, glob - - flags = re.NOFLAG if case_sensitive else re.IGNORECASE - regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep) - return re.compile(regex, flags=flags).match - - -class Globber: - """Class providing shell-style pattern matching and globbing. - """ - - def __init__(self, sep, case_sensitive, recursive=False): - self.sep = sep - self.case_sensitive = case_sensitive - self.recursive = recursive - - # Low-level methods - - lstat = staticmethod(os.lstat) - scandir = staticmethod(os.scandir) - parse_entry = operator.attrgetter('path') - concat_path = operator.add - - if os.name == 'nt': - def add_slash(self, pathname): - tail = os.path.splitroot(pathname)[2] - if not tail or tail[-1] in '\\/': - return pathname - return f'{pathname}\\' - else: - def add_slash(self, pathname): - if not pathname or pathname[-1] == '/': - return pathname - return f'{pathname}/' - - # High-level methods - - def compile(self, pat): - return compile_pattern(pat, self.sep, self.case_sensitive, self.recursive) - - def selector(self, parts): - """Returns a function that selects from a given path, walking and - filtering according to the glob-style pattern parts in *parts*. - """ - if not parts: - return self.select_exists - part = parts.pop() - if self.recursive and part == '**': - selector = self.recursive_selector - elif part in special_parts: - selector = self.special_selector - else: - selector = self.wildcard_selector - return selector(part, parts) - - def special_selector(self, part, parts): - """Returns a function that selects special children of the given path. - """ - select_next = self.selector(parts) - - def select_special(path, exists=False): - path = self.concat_path(self.add_slash(path), part) - return select_next(path, exists) - return select_special - - def wildcard_selector(self, part, parts): - """Returns a function that selects direct children of a given path, - filtering by pattern. - """ - - match = None if part == '*' else self.compile(part) - dir_only = bool(parts) - if dir_only: - select_next = self.selector(parts) - - def select_wildcard(path, exists=False): - try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(path) as scandir_it: - entries = list(scandir_it) - for entry in entries: - if match is None or match(entry.name): - if dir_only: - try: - if not entry.is_dir(): - continue - except OSError: - continue - entry_path = self.parse_entry(entry) - if dir_only: - yield from select_next(entry_path, exists=True) - else: - yield entry_path - except OSError: - pass - return select_wildcard - - def recursive_selector(self, part, parts): - """Returns a function that selects a given path and all its children, - recursively, filtering by pattern. - """ - # Optimization: consume following '**' parts, which have no effect. - while parts and parts[-1] == '**': - parts.pop() - - # Optimization: consume and join any following non-special parts here, - # rather than leaving them for the next selector. They're used to - # build a regular expression, which we use to filter the results of - # the recursive walk. As a result, non-special pattern segments - # following a '**' wildcard don't require additional filesystem access - # to expand. - follow_symlinks = self.recursive is not no_recurse_symlinks - if follow_symlinks: - while parts and parts[-1] not in special_parts: - part += self.sep + parts.pop() - - match = None if part == '**' else self.compile(part) - dir_only = bool(parts) - select_next = self.selector(parts) - - def select_recursive(path, exists=False): - path = self.add_slash(path) - match_pos = len(str(path)) - if match is None or match(str(path), match_pos): - yield from select_next(path, exists) - stack = [path] - while stack: - yield from select_recursive_step(stack, match_pos) - - def select_recursive_step(stack, match_pos): - path = stack.pop() - try: - # We must close the scandir() object before proceeding to - # avoid exhausting file descriptors when globbing deep trees. - with self.scandir(path) as scandir_it: - entries = list(scandir_it) - except OSError: - pass - else: - for entry in entries: - is_dir = False - try: - if entry.is_dir(follow_symlinks=follow_symlinks): - is_dir = True - except OSError: - pass - - if is_dir or not dir_only: - entry_path = self.parse_entry(entry) - if match is None or match(str(entry_path), match_pos): - if dir_only: - yield from select_next(entry_path, exists=True) - else: - # Optimization: directly yield the path if this is - # last pattern part. - yield entry_path - if is_dir: - stack.append(entry_path) - - return select_recursive - - def select_exists(self, path, exists=False): - """Yields the given path, if it exists. - """ - if exists: - # Optimization: this path is already known to exist, e.g. because - # it was returned from os.scandir(), so we skip calling lstat(). - yield path - else: - try: - self.lstat(path) - yield path - except OSError: - pass From d6314ac9af650bf3b072c75248887b6ddf8c836f Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 22:22:03 +0100 Subject: [PATCH 3/8] Fix handling of missing root path. --- Lib/pathlib/__init__.py | 2 ++ Lib/pathlib/_abc.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index e77d5da78a1a05..65455cbb185de6 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -619,6 +619,8 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): # GH-65238: pathlib doesn't preserve trailing slash. Add it back. parts.append('') parts.reverse() + if not self.is_dir(): + return iter([]) select = self._glob_selector(parts, case_sensitive, recurse_symlinks) return map(self.with_segments, select(str(self), exists=True)) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index de8f6de74c6642..c6662218da869e 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -690,8 +690,6 @@ def _make_child_relpath(self, name): return self.joinpath(name) def _glob_selector(self, parts, case_sensitive, recurse_symlinks): - if not self.is_dir(): - return iter([]) if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) recursive = True if recurse_symlinks else glob._no_recurse_symlinks @@ -707,6 +705,8 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): anchor, parts = pattern._stack if anchor: raise NotImplementedError("Non-relative patterns are unsupported") + if not self.is_dir(): + return iter([]) select = self._glob_selector(parts, case_sensitive, recurse_symlinks) return select(self, exists=True) From 8696ca0583a1faaea0a0dec47aca56dbcefbab43 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 6 Apr 2024 22:41:11 +0100 Subject: [PATCH 4/8] More precise error handling --- Lib/glob.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index e773c3acf32259..4a67a4a61a0619 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -403,6 +403,9 @@ def select_wildcard(path, exists=False): # avoid exhausting file descriptors when globbing deep trees. with self.scandir(path) as scandir_it: entries = list(scandir_it) + except OSError: + pass + else: for entry in entries: if match is None or match(entry.name): if dir_only: @@ -416,8 +419,6 @@ def select_wildcard(path, exists=False): yield from select_next(entry_path, exists=True) else: yield entry_path - except OSError: - pass return select_wildcard def recursive_selector(self, part, parts): From 824f1f682d6b07ad650e1a610a9369ea38be0b17 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 7 Apr 2024 00:12:00 +0100 Subject: [PATCH 5/8] Ensure results are normalized. --- Lib/pathlib/__init__.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 65455cbb185de6..873ad414209e99 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -602,6 +602,25 @@ def _make_child_relpath(self, name): path._tail_cached = tail + [name] return path + def _make_glob_paths(self, paths): + """Yields normalized path objects from the given iterable of string + glob results.""" + sep = self.parser.sep + drive = self.drive + root = self.root + prefix = drive + root + prefix_len = len(prefix) + if not prefix_len and not self._tail_cached: + prefix_len = 2 # strip off leading "./" + for path in paths: + tail = path[prefix_len:].removesuffix(sep) + path = self.with_segments(path) + path._str = (prefix + tail) or '.' + path._drv = drive + path._root = root + path._tail_cached = tail.split(sep) + yield path + def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): """Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. @@ -622,7 +641,9 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): if not self.is_dir(): return iter([]) select = self._glob_selector(parts, case_sensitive, recurse_symlinks) - return map(self.with_segments, select(str(self), exists=True)) + paths = select(str(self), exists=True) + paths = self._make_glob_paths(paths) + return paths def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): """Recursively yield all existing files (of any kind, including From 60eb3d024fc10f0c3a1f47c620aa1f16e796ed21 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 7 Apr 2024 00:50:22 +0100 Subject: [PATCH 6/8] Speed up results normalization --- Lib/pathlib/__init__.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 873ad414209e99..294e9d66ec154e 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -606,19 +606,13 @@ def _make_glob_paths(self, paths): """Yields normalized path objects from the given iterable of string glob results.""" sep = self.parser.sep - drive = self.drive - root = self.root - prefix = drive + root - prefix_len = len(prefix) - if not prefix_len and not self._tail_cached: - prefix_len = 2 # strip off leading "./" - for path in paths: - tail = path[prefix_len:].removesuffix(sep) - path = self.with_segments(path) - path._str = (prefix + tail) or '.' - path._drv = drive - path._root = root - path._tail_cached = tail.split(sep) + prefix_len = len(self.anchor) + for path_str in paths: + if len(path_str) > prefix_len and path_str[-1] == sep: + # Strip trailing slash. + path_str = path_str[:-1] + path = self.with_segments(path_str) + path._str = path_str or '.' yield path def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): @@ -641,7 +635,11 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): if not self.is_dir(): return iter([]) select = self._glob_selector(parts, case_sensitive, recurse_symlinks) - paths = select(str(self), exists=True) + path = str(self) + paths = select(path, exists=True) + if path == '.': + # Strip leading './'. + paths = map(lambda p: p[2:], paths) paths = self._make_glob_paths(paths) return paths From 98dea961d14f2d8d65a53820c3163ee36d906982 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 7 Apr 2024 16:43:00 +0100 Subject: [PATCH 7/8] Define add_slash() in _Globber itself. --- Lib/glob.py | 30 +++++++++++++++--------------- Lib/pathlib/_abc.py | 6 ++++-- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/Lib/glob.py b/Lib/glob.py index 4a67a4a61a0619..62cf0394e921d7 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -327,19 +327,6 @@ def _compile_pattern(pat, sep, case_sensitive, recursive=True): return re.compile(regex, flags=flags).match -if os.name == 'nt': - def _add_slash(pathname): - tail = os.path.splitroot(pathname)[2] - if not tail or tail[-1] in '\\/': - return pathname - return f'{pathname}\\' -else: - def _add_slash(pathname): - if not pathname or pathname[-1] == '/': - return pathname - return f'{pathname}/' - - class _Globber: """Class providing shell-style pattern matching and globbing. """ @@ -353,9 +340,22 @@ def __init__(self, sep, case_sensitive, recursive=False): lstat = staticmethod(os.lstat) scandir = staticmethod(os.scandir) - add_slash = staticmethod(_add_slash) - concat_path = operator.add parse_entry = operator.attrgetter('path') + concat_path = operator.add + + if os.name == 'nt': + @staticmethod + def add_slash(pathname): + tail = os.path.splitroot(pathname)[2] + if not tail or tail[-1] in '\\/': + return pathname + return f'{pathname}\\' + else: + @staticmethod + def add_slash(pathname): + if not pathname or pathname[-1] == '/': + return pathname + return f'{pathname}/' # High-level methods diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index c6662218da869e..553f797d75e793 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -48,12 +48,14 @@ class Globber(glob._Globber): scandir = operator.methodcaller('_scandir') add_slash = operator.methodcaller('joinpath', '') - def concat_path(self, path, text): + @staticmethod + def concat_path(path, text): """Appends text to the given path. """ return path.with_segments(path._raw_path + text) - def parse_entry(self, entry): + @staticmethod + def parse_entry(entry): """Returns the path of an entry yielded from scandir(). """ return entry From ebcd7fcca8db0a077d8cb2c4eea689de2c921a47 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 7 Apr 2024 18:25:16 +0100 Subject: [PATCH 8/8] Slightly speed up path renormalisation. --- Lib/pathlib/__init__.py | 58 ++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 294e9d66ec154e..88e3286d9b08dc 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -8,6 +8,7 @@ import glob import io import ntpath +import operator import os import posixpath import sys @@ -255,14 +256,17 @@ def _format_parsed_parts(cls, drv, root, tail): return cls.parser.sep.join(tail) def _from_parsed_parts(self, drv, root, tail): - path_str = self._format_parsed_parts(drv, root, tail) - path = self.with_segments(path_str) - path._str = path_str or '.' + path = self._from_parsed_string(self._format_parsed_parts(drv, root, tail)) path._drv = drv path._root = root path._tail_cached = tail return path + def _from_parsed_string(self, path_str): + path = self.with_segments(path_str) + path._str = path_str or '.' + return path + @classmethod def _parse_path(cls, path): if not path: @@ -563,6 +567,17 @@ def write_text(self, data, encoding=None, errors=None, newline=None): encoding = io.text_encoding(encoding) return _abc.PathBase.write_text(self, data, encoding, errors, newline) + _remove_leading_dot = operator.itemgetter(slice(2, None)) + _remove_trailing_slash = operator.itemgetter(slice(-1)) + + def _filter_trailing_slash(self, paths): + sep = self.parser.sep + anchor_len = len(self.anchor) + for path_str in paths: + if len(path_str) > anchor_len and path_str[-1] == sep: + path_str = path_str[:-1] + yield path_str + def iterdir(self): """Yield path objects of the directory contents. @@ -602,19 +617,6 @@ def _make_child_relpath(self, name): path._tail_cached = tail + [name] return path - def _make_glob_paths(self, paths): - """Yields normalized path objects from the given iterable of string - glob results.""" - sep = self.parser.sep - prefix_len = len(self.anchor) - for path_str in paths: - if len(path_str) > prefix_len and path_str[-1] == sep: - # Strip trailing slash. - path_str = path_str[:-1] - path = self.with_segments(path_str) - path._str = path_str or '.' - yield path - def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): """Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. @@ -631,16 +633,20 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): if raw[-1] in (self.parser.sep, self.parser.altsep): # GH-65238: pathlib doesn't preserve trailing slash. Add it back. parts.append('') - parts.reverse() if not self.is_dir(): return iter([]) - select = self._glob_selector(parts, case_sensitive, recurse_symlinks) - path = str(self) - paths = select(path, exists=True) - if path == '.': - # Strip leading './'. - paths = map(lambda p: p[2:], paths) - paths = self._make_glob_paths(paths) + select = self._glob_selector(parts[::-1], case_sensitive, recurse_symlinks) + root = str(self) + paths = select(root, exists=True) + + # Normalize results + if root == '.': + paths = map(self._remove_leading_dot, paths) + if parts[-1] == '': + paths = map(self._remove_trailing_slash, paths) + elif parts[-1] == '**': + paths = self._filter_trailing_slash(paths) + paths = map(self._from_parsed_string, paths) return paths def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): @@ -682,9 +688,7 @@ def absolute(self): # of joining, and we exploit the fact that getcwd() returns a # fully-normalized string by storing it in _str. This is used to # implement Path.cwd(). - result = self.with_segments(cwd) - result._str = cwd - return result + return self._from_parsed_string(cwd) drive, root, rel = os.path.splitroot(cwd) if not rel: return self._from_parsed_parts(drive, root, self._tail)