Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-115060: Speed up pathlib.Path.glob() by skipping directory scanning #116152

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1004,9 +1004,7 @@ call fails (for example because the path doesn't exist).
.. seealso::
:ref:`pathlib-pattern-language` documentation.

This method calls :meth:`Path.is_dir` on the top-level directory and
propagates any :exc:`OSError` exception that is raised. Subsequent
:exc:`OSError` exceptions from scanning directories are suppressed.
This method suppresses :exc:`OSError` exceptions.

By default, or when the *case_sensitive* keyword-only argument is set to
``None``, this method matches paths using platform-specific casing rules:
Expand Down
97 changes: 86 additions & 11 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def _ignore_error(exception):
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)


def _is_wildcard_pattern(pat):
    """Return True if *pat* contains a glob magic character.

    A segment containing ``*``, ``?`` or ``[`` needs real pattern matching;
    a segment without any of them can be looked up directly as a file name.
    """
    return any(magic in pat for magic in "*?[")


@functools.cache
def _is_case_sensitive(pathmod):
    """Report whether *pathmod* treats path names case-sensitively.

    The check feeds a mixed-case probe through ``pathmod.normcase()``: if the
    probe comes back unchanged, case is significant. The answer is memoised
    per path module by ``functools.cache``.
    """
    probe = 'Aa'
    normalised = pathmod.normcase(probe)
    return normalised == probe
Expand All @@ -60,16 +66,47 @@ def _compile_pattern(pat, sep, case_sensitive, recursive=True):
return re.compile(regex, flags=flags).match


def _select_special(paths, part):
"""Yield special literal children of the given paths."""
def _select_literal(paths, part):
    """Lazily join the literal segment *part* onto each of *paths*.

    Every parent is extended with ``_make_child_relpath(part)``; this helper
    itself performs no existence check on the resulting child.
    """
    yield from (parent._make_child_relpath(part) for parent in paths)


def _select_directories(paths):
    """Pass through only those of *paths* whose ``is_dir()`` is true.

    A path whose check raises ``OSError`` is silently dropped rather than
    aborting the whole iteration.
    """
    for candidate in paths:
        try:
            if not candidate.is_dir():
                continue
            yield candidate
        except OSError:
            continue


def _deselect_missing(paths):
    """Pass through only those of *paths* that can be stat()ed.

    Each path is probed with ``stat(follow_symlinks=False)``; any path for
    which that raises ``OSError`` is dropped from the output.
    """
    for candidate in paths:
        try:
            candidate.stat(follow_symlinks=False)
            yield candidate
        except OSError:
            continue


def _deselect_symlinks(paths, dir_only, follow_symlinks):
    """Pass *paths* through, dropping symlinks when they must not be followed.

    Filtering only happens when *follow_symlinks* is false and *dir_only* is
    true; in every other case each path is yielded untouched. While
    filtering, a path whose ``is_symlink()`` raises ``OSError`` is dropped.
    """
    for candidate in paths:
        if not follow_symlinks and dir_only:
            try:
                if candidate.is_symlink():
                    continue
                yield candidate
            except OSError:
                continue
        else:
            yield candidate


def _select_children(parent_paths, dir_only, follow_symlinks, match):
"""Yield direct children of given paths, filtering by name and type."""
if follow_symlinks is None:
follow_symlinks = True
for parent_path in parent_paths:
try:
# We must close the scandir() object before proceeding to
Expand All @@ -96,8 +133,6 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
"""Yield given paths and all their children, recursively, filtering by
string and type.
"""
if follow_symlinks is None:
follow_symlinks = False
for parent_path in parent_paths:
if match is not None:
# If we're filtering paths through a regex, record the length of
Expand Down Expand Up @@ -799,16 +834,33 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
# TODO: evaluate case-sensitivity of each directory in _select_children().
case_sensitive = _is_case_sensitive(self.pathmod)

# User doesn't care about case sensitivity, so for non-wildcard
# patterns like "foo/bar" we can stat() once rather than scandir()
# twice. Returned paths may not match real filesystem case.
case_preserving = False
else:
# Explicit case sensitivity choice provided. We must use scandir()
# to retrieve and match filenames with real filesystem case.
case_preserving = True

if follow_symlinks is None:
# Legacy behaviour: follow symlinks unless we're expanding '**'.
follow_symlinks = True
follow_symlinks_recursive = False
else:
follow_symlinks_recursive = follow_symlinks

stack = pattern._pattern_stack
specials = ('', '.', '..')
check_paths = True
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
paths = iter([self])
while stack:
part = stack.pop()
if part in specials:
# Join special component (e.g. '..') onto paths.
paths = _select_special(paths, part)
paths = _select_literal(paths, part)

elif part == '**':
# Consume following '**' components, which have no effect.
Expand All @@ -818,16 +870,21 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
# Consume following non-special components, provided we're
# treating symlinks consistently. Each component is joined
# onto 'part', which is used to generate an re.Pattern object.
if follow_symlinks is not None:
if follow_symlinks == follow_symlinks_recursive:
while stack and stack[-1] not in specials:
part += sep + stack.pop()

# If the previous loop consumed pattern components, compile an
# re.Pattern object based on those components.
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None

# Ensure directories exist.
if check_paths:
paths = _select_directories(paths)
check_paths = False

# Recursively walk directories, filtering by type and regex.
paths = _select_recursive(paths, bool(stack), follow_symlinks, match)
paths = _select_recursive(paths, bool(stack), follow_symlinks_recursive, match)

# De-duplicate if we've already seen a '**' component.
if deduplicate_paths:
Expand All @@ -837,13 +894,31 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")

else:
elif case_preserving or _is_wildcard_pattern(part):
# If the pattern component isn't '*', compile an re.Pattern
# object based on the component.
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None

# Iterate over directories' children filtering by type and regex.
paths = _select_children(paths, bool(stack), follow_symlinks, match)

# Paths are known to exist: they're directory children from _scandir()
check_paths = False

else:
# Join non-wildcard component onto paths.
paths = _select_literal(paths, part)

# Filter out symlinks if requested.
paths = _deselect_symlinks(paths, bool(stack), follow_symlinks)

# Paths might not exist; mark them to be checked.
check_paths = True

if check_paths:
# Filter out paths that don't exist.
paths = _deselect_missing(paths)

return paths

def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
Expand Down
34 changes: 26 additions & 8 deletions Lib/test/test_pathlib/test_pathlib_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,10 +1431,10 @@ def __repr__(self):
return "{}({!r})".format(self.__class__.__name__, self.as_posix())

def stat(self, *, follow_symlinks=True):
if follow_symlinks:
path = str(self.resolve())
if follow_symlinks or not self.name or self.name == '.' or self.name == '..':
path = str(self.resolve(strict=True))
else:
path = str(self.parent.resolve() / self.name)
path = str(self.parent.resolve(strict=True) / self.name)
if path in self._files:
st_mode = stat.S_IFREG
elif path in self._directories:
Expand Down Expand Up @@ -1752,10 +1752,10 @@ def test_glob_posix(self):
def test_glob_windows(self):
P = self.cls
p = P(self.base)
self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") })
self.assertEqual(set(p.glob("FILEa")), { P(self.base, "FILEa") })
self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") })
self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") })
self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"})
self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\FILEa"})
self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"})

def test_glob_empty_pattern(self):
Expand Down Expand Up @@ -1843,6 +1843,11 @@ def _check(path, glob, expected):
_check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD/"])
_check(p, "*/dirD/**", ["dirC/dirD/", "dirC/dirD/fileD"])
_check(p, "*/dirD/**/", ["dirC/dirD/"])
_check(p, "linkA", ["linkA"])
_check(p, "linkB", ["linkB"])
_check(p, "linkB/fileB", [])
_check(p, "dirA/linkC", ["dirA/linkC"])
_check(p, "dirA/linkC/fileB", [])

def test_rglob_common(self):
def _check(glob, expected):
Expand Down Expand Up @@ -1903,9 +1908,9 @@ def test_rglob_posix(self):
def test_rglob_windows(self):
P = self.cls
p = P(self.base, "dirC")
self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") })
self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/FILEd") })
self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") })
self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"})
self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\FILEd"})

@needs_symlinks
def test_rglob_follow_symlinks_common(self):
Expand Down Expand Up @@ -1993,9 +1998,22 @@ def test_glob_dotdot(self):
self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") })
self.assertEqual(set(p.glob("dirA/../file*/..")), set())
self.assertEqual(set(p.glob("../xyzzy")), set())
self.assertEqual(set(p.glob("xyzzy/..")), set())
self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)})

@needs_posix
def test_glob_dotdot_posix(self):
    # On POSIX, globbing 'xyzzy/..' is expected to yield nothing
    # (presumably because 'xyzzy' is absent from the test tree).
    root = self.cls(self.base)
    found = set(root.glob("xyzzy/.."))
    self.assertEqual(found, set())

@needs_windows
def test_glob_dotdot_windows(self):
    # Windows resolves '..' segments first, so the 'xyzzy' component
    # does not have to exist for 'xyzzy/..' to produce a match.
    make_path = self.cls
    root = make_path(self.base)
    found = set(root.glob("xyzzy/.."))
    expected = {make_path(self.base, "xyzzy", "..")}
    self.assertEqual(found, expected)

@needs_symlinks
def test_glob_permissions(self):
# See bpo-38894
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Speed up handling of non-wildcard pattern segments in
:meth:`pathlib.Path.glob`.
Loading