Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-115060: Speed up pathlib.Path.glob() by not scanning literal parts #117732

Merged
merged 4 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,10 @@ class _Globber:
"""Class providing shell-style pattern matching and globbing.
"""

def __init__(self, sep, case_sensitive, recursive=False):
def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False):
self.sep = sep
self.case_sensitive = case_sensitive
self.case_pedantic = case_pedantic
self.recursive = recursive

# Low-level methods
Expand Down Expand Up @@ -373,6 +374,8 @@ def selector(self, parts):
selector = self.recursive_selector
elif part in _special_parts:
selector = self.special_selector
elif not self.case_pedantic and magic_check.search(part) is None:
selector = self.literal_selector
else:
selector = self.wildcard_selector
return selector(part, parts)
Expand All @@ -387,6 +390,23 @@ def select_special(path, exists=False):
return select_next(path, exists)
return select_special

def literal_selector(self, part, parts):
    """Build a selector for a literal (wildcard-free) pattern segment.

    *part* is the literal segment being handled. *parts* is the stack of
    remaining segments, with the next segment at the end of the list; it
    is mutated in place, as every trailing segment that contains no magic
    characters is popped off and folded into the literal.

    Returns a function ``select_literal(path, exists=False)`` that joins
    the combined literal onto *path* and hands the result to the selector
    built for whatever segments remain.
    """
    # Gather the whole run of consecutive literal segments up front so that
    # the returned closure performs a single add_slash()/concat_path() pair
    # rather than one per segment.
    literal_run = [part]
    while parts:
        if magic_check.search(parts[-1]) is not None:
            break
        literal_run.append(parts.pop())
    joined = self.sep.join(literal_run)

    descend = self.selector(parts)

    def select_literal(path, exists=False):
        # Nothing in this function checks that the joined path is present,
        # so the incoming *exists* value is deliberately not forwarded;
        # the next selector is always told exists=False.
        target = self.concat_path(self.add_slash(path), joined)
        return descend(target, exists=False)

    return select_literal

def wildcard_selector(self, part, parts):
"""Returns a function that selects direct children of a given path,
filtering by pattern.
Expand Down
8 changes: 7 additions & 1 deletion Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,8 +694,14 @@ def _make_child_relpath(self, name):
def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
case_pedantic = False
else:
# The user has expressed a case sensitivity choice, but we don't
# know the case sensitivity of the underlying filesystem, so we
# must use scandir() for everything, including non-wildcard parts.
case_pedantic = True
recursive = True if recurse_symlinks else glob._no_recurse_symlinks
globber = self._globber(self.parser.sep, case_sensitive, recursive)
globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive)
return globber.selector(parts)

def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
Expand Down
21 changes: 12 additions & 9 deletions Lib/test/test_pathlib/test_pathlib_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1429,10 +1429,10 @@ def __repr__(self):
return "{}({!r})".format(self.__class__.__name__, self.as_posix())

def stat(self, *, follow_symlinks=True):
if follow_symlinks:
path = str(self.resolve())
if follow_symlinks or self.name in ('', '.', '..'):
path = str(self.resolve(strict=True))
else:
path = str(self.parent.resolve() / self.name)
path = str(self.parent.resolve(strict=True) / self.name)
if path in self._files:
st_mode = stat.S_IFREG
elif path in self._directories:
Expand Down Expand Up @@ -1741,8 +1741,9 @@ def _check(glob, expected):
def test_glob_posix(self):
P = self.cls
p = P(self.base)
q = p / "FILEa"
given = set(p.glob("FILEa"))
expect = set()
expect = {q} if q.exists() else set()
self.assertEqual(given, expect)
self.assertEqual(set(p.glob("FILEa*")), set())

Expand All @@ -1753,8 +1754,6 @@ def test_glob_windows(self):
self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") })
self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") })
self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") })
self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"})
self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"})

def test_glob_empty_pattern(self):
P = self.cls
Expand Down Expand Up @@ -1857,8 +1856,9 @@ def _check(path, glob, expected):
def test_rglob_posix(self):
P = self.cls
p = P(self.base, "dirC")
q = p / "dirD" / "FILEd"
given = set(p.rglob("FILEd"))
expect = set()
expect = {q} if q.exists() else set()
self.assertEqual(given, expect)
self.assertEqual(set(p.rglob("FILEd*")), set())

Expand All @@ -1868,7 +1868,6 @@ def test_rglob_windows(self):
p = P(self.base, "dirC")
self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") })
self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") })
self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"})

@needs_symlinks
def test_rglob_recurse_symlinks_common(self):
Expand Down Expand Up @@ -1931,7 +1930,11 @@ def test_glob_dotdot(self):
self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") })
self.assertEqual(set(p.glob("dirA/../file*/..")), set())
self.assertEqual(set(p.glob("../xyzzy")), set())
self.assertEqual(set(p.glob("xyzzy/..")), set())
if self.cls.parser is posixpath:
self.assertEqual(set(p.glob("xyzzy/..")), set())
else:
# ".." segments are normalized first on Windows, so this path is stat()able.
self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") })
self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)})

@needs_symlinks
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Speed up :meth:`pathlib.Path.glob` by not scanning directories for
non-wildcard pattern segments.
Loading