diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 4b461a5d4a2949..3f4d2c16f2eb2b 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -1004,9 +1004,7 @@ call fails (for example because the path doesn't exist). .. seealso:: :ref:`pathlib-pattern-language` documentation. - This method calls :meth:`Path.is_dir` on the top-level directory and - propagates any :exc:`OSError` exception that is raised. Subsequent - :exc:`OSError` exceptions from scanning directories are suppressed. + This method suppresses :exc:`OSError` exceptions. By default, or when the *case_sensitive* keyword-only argument is set to ``None``, this method matches paths using platform-specific casing rules: diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 44fea525b6cac7..599229c3bb2f05 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -36,6 +36,12 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +def _is_wildcard_pattern(pat): + """Whether this pattern needs actual matching using fnmatch, or can be + looked up directly as a file.""" + return "*" in pat or "?" 
in pat or "[" in pat + + @functools.cache def _is_case_sensitive(pathmod): return pathmod.normcase('Aa') == 'Aa' @@ -60,16 +66,47 @@ def _compile_pattern(pat, sep, case_sensitive, recursive=True): return re.compile(regex, flags=flags).match -def _select_special(paths, part): - """Yield special literal children of the given paths.""" +def _select_literal(paths, part): + """Yield literal children of the given paths.""" for path in paths: yield path._make_child_relpath(part) +def _select_directories(paths): + """Yield the given paths, filtering out non-directories.""" + for path in paths: + try: + if path.is_dir(): + yield path + except OSError: + pass + + +def _deselect_missing(paths): + """Yield the given paths, filtering out missing files.""" + for path in paths: + try: + path.stat(follow_symlinks=False) + yield path + except OSError: + pass + + +def _deselect_symlinks(paths, dir_only, follow_symlinks): + """Yield the given paths, filtering out symlinks.""" + for path in paths: + if follow_symlinks or not dir_only: + yield path + else: + try: + if not path.is_symlink(): + yield path + except OSError: + pass + + def _select_children(parent_paths, dir_only, follow_symlinks, match): """Yield direct children of given paths, filtering by name and type.""" - if follow_symlinks is None: - follow_symlinks = True for parent_path in parent_paths: try: # We must close the scandir() object before proceeding to @@ -96,8 +133,6 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): """Yield given paths and all their children, recursively, filtering by string and type. """ - if follow_symlinks is None: - follow_symlinks = False for parent_path in parent_paths: if match is not None: # If we're filtering paths through a regex, record the length of @@ -799,16 +834,33 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # TODO: evaluate case-sensitivity of each directory in _select_children(). 
case_sensitive = _is_case_sensitive(self.pathmod) + # User doesn't care about case sensitivity, so for non-wildcard + # patterns like "foo/bar" we can stat() once rather than scandir() + # twice. Returned paths may not match real filesystem case. + case_preserving = False + else: + # Explicit case sensitivity choice provided. We must use scandir() + # to retrieve and match filenames with real filesystem case. + case_preserving = True + + if follow_symlinks is None: + # Legacy behaviour: follow symlinks unless we're expanding '**'. + follow_symlinks = True + follow_symlinks_recursive = False + else: + follow_symlinks_recursive = follow_symlinks + stack = pattern._pattern_stack specials = ('', '.', '..') + check_paths = True deduplicate_paths = False sep = self.pathmod.sep - paths = iter([self] if self.is_dir() else []) + paths = iter([self]) while stack: part = stack.pop() if part in specials: # Join special component (e.g. '..') onto paths. - paths = _select_special(paths, part) + paths = _select_literal(paths, part) elif part == '**': # Consume following '**' components, which have no effect. @@ -818,7 +870,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # Consume following non-special components, provided we're # treating symlinks consistently. Each component is joined # onto 'part', which is used to generate an re.Pattern object. - if follow_symlinks is not None: + if follow_symlinks == follow_symlinks_recursive: while stack and stack[-1] not in specials: part += sep + stack.pop() @@ -826,8 +878,13 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # re.Pattern object based on those components. match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None + # Ensure directories exist. + if check_paths: + paths = _select_directories(paths) + check_paths = False + # Recursively walk directories, filtering by type and regex. 
- paths = _select_recursive(paths, bool(stack), follow_symlinks, match) + paths = _select_recursive(paths, bool(stack), follow_symlinks_recursive, match) # De-duplicate if we've already seen a '**' component. if deduplicate_paths: @@ -837,13 +894,31 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: + elif case_preserving or _is_wildcard_pattern(part): # If the pattern component isn't '*', compile an re.Pattern # object based on the component. match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None # Iterate over directories' children filtering by type and regex. paths = _select_children(paths, bool(stack), follow_symlinks, match) + + # Paths are known to exist: they're directory children from _scandir() + check_paths = False + + else: + # Join non-wildcard component onto paths. + paths = _select_literal(paths, part) + + # Filter out symlinks if requested. + paths = _deselect_symlinks(paths, bool(stack), follow_symlinks) + + # Paths might not exist; mark them to be checked. + check_paths = True + + if check_paths: + # Filter out paths that don't exist. + paths = _deselect_missing(paths) + return paths def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 5bfb76f85c7909..8b0a1ed2baba38 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1431,10 +1431,10 @@ def __repr__(self): return "{}({!r})".format(self.__class__.__name__, self.as_posix()) def stat(self, *, follow_symlinks=True): - if follow_symlinks: - path = str(self.resolve()) + if follow_symlinks or not self.name or self.name == '.' 
or self.name == '..': + path = str(self.resolve(strict=True)) else: - path = str(self.parent.resolve() / self.name) + path = str(self.parent.resolve(strict=True) / self.name) if path in self._files: st_mode = stat.S_IFREG elif path in self._directories: @@ -1752,10 +1752,10 @@ def test_glob_posix(self): def test_glob_windows(self): P = self.cls p = P(self.base) - self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") }) + self.assertEqual(set(p.glob("FILEa")), { P(self.base, "FILEa") }) self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") }) self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") }) - self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"}) + self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\FILEa"}) self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"}) def test_glob_empty_pattern(self): @@ -1843,6 +1843,11 @@ def _check(path, glob, expected): _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD/"]) _check(p, "*/dirD/**", ["dirC/dirD/", "dirC/dirD/fileD"]) _check(p, "*/dirD/**/", ["dirC/dirD/"]) + _check(p, "linkA", ["linkA"]) + _check(p, "linkB", ["linkB"]) + _check(p, "linkB/fileB", []) + _check(p, "dirA/linkC", ["dirA/linkC"]) + _check(p, "dirA/linkC/fileB", []) def test_rglob_common(self): def _check(glob, expected): @@ -1903,9 +1908,9 @@ def test_rglob_posix(self): def test_rglob_windows(self): P = self.cls p = P(self.base, "dirC") - self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") }) + self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/FILEd") }) self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") }) - self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"}) + self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\FILEd"}) @needs_symlinks def test_rglob_follow_symlinks_common(self): @@ -1993,9 +1998,22 @@ def test_glob_dotdot(self): self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") }) 
self.assertEqual(set(p.glob("dirA/../file*/..")), set()) self.assertEqual(set(p.glob("../xyzzy")), set()) - self.assertEqual(set(p.glob("xyzzy/..")), set()) self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)}) + @needs_posix + def test_glob_dotdot_posix(self): + P = self.cls + p = P(self.base) + self.assertEqual(set(p.glob("xyzzy/..")), set()) + + @needs_windows + def test_glob_dotdot_windows(self): + # '..' segments are resolved first on Windows, so + # 'xyzzy' doesn't need to exist. + P = self.cls + p = P(self.base) + self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") }) + @needs_symlinks def test_glob_permissions(self): # See bpo-38894 diff --git a/Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst b/Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst new file mode 100644 index 00000000000000..6e612cb0d0ed2d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst @@ -0,0 +1,2 @@ +Speed up handling of non-wildcard pattern segments in +:meth:`pathlib.Path.glob`.