From f1b3b0ecbf6145eb79c9f11147c773c6b99b86d4 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 10 Apr 2024 22:34:12 +0100 Subject: [PATCH 1/4] GH-115060: Speed up `pathlib.Path.glob()` by not scanning literal parts Don't bother calling `os.scandir()` to scan for literal pattern segments, like `foo` in `foo/*.py`. Instead, append the segment(s) as-is and call through to the next selector with `exists=False`, which signals that the path might not exist. Subsequent selectors will call `os.scandir()` or `os.lstat()` to filter out missing paths as needed. --- Lib/glob.py | 22 ++++++++++++++++++- Lib/pathlib/_abc.py | 5 ++++- Lib/test/test_pathlib/test_pathlib_abc.py | 6 ++--- ...-04-10-22-35-24.gh-issue-115060.XEVuOb.rst | 2 ++ 4 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst diff --git a/Lib/glob.py b/Lib/glob.py index 62cf0394e921d7..50b80c8d59dc70 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -331,9 +331,10 @@ class _Globber: """Class providing shell-style pattern matching and globbing. """ - def __init__(self, sep, case_sensitive, recursive=False): + def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False): self.sep = sep self.case_sensitive = case_sensitive + self.case_pedantic = case_pedantic self.recursive = recursive # Low-level methods @@ -373,6 +374,8 @@ def selector(self, parts): selector = self.recursive_selector elif part in _special_parts: selector = self.special_selector + elif not self.case_pedantic and magic_check.search(part) is None: + selector = self.literal_selector else: selector = self.wildcard_selector return selector(part, parts) @@ -387,6 +390,23 @@ def select_special(path, exists=False): return select_next(path, exists) return select_special + def literal_selector(self, part, parts): + """Returns a function that selects a literal descendant of a path. + """ + + # Optimization: consume and join any subsequent literal parts here, + # rather than leaving them for the next selector. This reduces the + # number of string concatenation operations and calls to add_slash(). + while parts and magic_check.search(parts[-1]) is None: + part += self.sep + parts.pop() + + select_next = self.selector(parts) + + def select_literal(path, exists=False): + path = self.concat_path(self.add_slash(path), part) + return select_next(path, exists=False) + return select_literal + def wildcard_selector(self, part, parts): """Returns a function that selects direct children of a given path, filtering by pattern. diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 553f797d75e793..27735618a43a00 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -694,8 +694,11 @@ def _make_child_relpath(self, name): def _glob_selector(self, parts, case_sensitive, recurse_symlinks): if case_sensitive is None: case_sensitive = _is_case_sensitive(self.parser) + case_pedantic = False + else: + case_pedantic = True recursive = True if recurse_symlinks else glob._no_recurse_symlinks - globber = self._globber(self.parser.sep, case_sensitive, recursive) + globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive) return globber.selector(parts) def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True): diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 336115cf0fead2..9e532fb4d17f4b 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1429,10 +1429,10 @@ def __repr__(self): return "{}({!r})".format(self.__class__.__name__, self.as_posix()) def stat(self, *, follow_symlinks=True): - if follow_symlinks: - path = str(self.resolve()) + if follow_symlinks or self.name in ('', '.', '..'): + path = str(self.resolve(strict=True)) else: - path = str(self.parent.resolve() / self.name) + path = str(self.parent.resolve(strict=True) / self.name) if path in self._files: st_mode = stat.S_IFREG elif path in self._directories: diff --git a/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst b/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst new file mode 100644 index 00000000000000..b5084a0e86c74f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst @@ -0,0 +1,2 @@ +Speed up :meth:`pathlib.Path.glob` by not scanning directories for +non-wildcard pattern segments. From 4c48dc098fb1604ac3a495982e1884432c0b34a3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 10 Apr 2024 23:03:09 +0100 Subject: [PATCH 2/4] Fix tests, add comment. --- Lib/pathlib/_abc.py | 3 +++ Lib/test/test_pathlib/test_pathlib_abc.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 27735618a43a00..1597ae99863600 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -696,6 +696,9 @@ def _glob_selector(self, parts, case_sensitive, recurse_symlinks): case_sensitive = _is_case_sensitive(self.parser) case_pedantic = False else: + # The user has expressed a case sensitivity choice, but we don't + # know the case sensitivity of the underlying filesystem, so we + # must use scandir() for everything, including non-wildcard parts. case_pedantic = True recursive = True if recurse_symlinks else glob._no_recurse_symlinks globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive) diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 9e532fb4d17f4b..69ebc6557a958d 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1741,8 +1741,9 @@ def _check(glob, expected): def test_glob_posix(self): P = self.cls p = P(self.base) + q = p / "FILEa" given = set(p.glob("FILEa")) - expect = set() + expect = {q} if q.exists() else set() self.assertEqual(given, expect) self.assertEqual(set(p.glob("FILEa*")), set()) @@ -1753,8 +1754,6 @@ def test_glob_windows(self): self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") }) self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") }) self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") }) - self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"}) - self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"}) def test_glob_empty_pattern(self): P = self.cls @@ -1857,8 +1856,9 @@ def _check(path, glob, expected): def test_rglob_posix(self): P = self.cls p = P(self.base, "dirC") + q = p / "dirC" / "FILEd" given = set(p.rglob("FILEd")) - expect = set() + expect = {q} if q.exists() else set() self.assertEqual(given, expect) self.assertEqual(set(p.rglob("FILEd*")), set()) @@ -1868,7 +1868,6 @@ def test_rglob_windows(self): p = P(self.base, "dirC") self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") }) self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") }) - self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"}) @needs_symlinks def test_rglob_recurse_symlinks_common(self): @@ -1931,7 +1930,11 @@ def test_glob_dotdot(self): self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") }) self.assertEqual(set(p.glob("dirA/../file*/..")), set()) self.assertEqual(set(p.glob("../xyzzy")), set()) - self.assertEqual(set(p.glob("xyzzy/..")), set()) + if self.cls.parser is posixpath: + self.assertEqual(set(p.glob("xyzzy/..")), set()) + else: + # ".." segments are normalized first on Windows, so this path is stat()able. + self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "zyzzy", "..") }) self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)}) @needs_symlinks From 0e7aee7b93df01d4f12f7b37bf978cc41976c3f0 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 11 Apr 2024 00:35:19 +0100 Subject: [PATCH 3/4] More test fixes --- Lib/test/test_pathlib/test_pathlib_abc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 69ebc6557a958d..621fa05e6681ad 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1856,7 +1856,7 @@ def _check(path, glob, expected): def test_rglob_posix(self): P = self.cls p = P(self.base, "dirC") - q = p / "dirC" / "FILEd" + q = p / "dirC" / "dirD" / "FILEd" given = set(p.rglob("FILEd")) expect = {q} if q.exists() else set() self.assertEqual(given, expect) @@ -1934,7 +1934,7 @@ def test_glob_dotdot(self): self.assertEqual(set(p.glob("xyzzy/..")), set()) else: # ".." segments are normalized first on Windows, so this path is stat()able. - self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "zyzzy", "..") }) + self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") }) self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)}) @needs_symlinks From c4e3edc27285f617538d9447ff31665f9f0bf59a Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 11 Apr 2024 00:52:58 +0100 Subject: [PATCH 4/4] Once more with feeling --- Lib/test/test_pathlib/test_pathlib_abc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 621fa05e6681ad..6656b032cde28e 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1856,7 +1856,7 @@ def _check(path, glob, expected): def test_rglob_posix(self): P = self.cls p = P(self.base, "dirC") - q = p / "dirC" / "dirD" / "FILEd" + q = p / "dirD" / "FILEd" given = set(p.rglob("FILEd")) expect = {q} if q.exists() else set() self.assertEqual(given, expect)