Skip to content

Commit a40f557

Browse files
authored
GH-116380: Move pathlib globbing implementation into pathlib._glob (#118562)
Moving this code under the `pathlib` package makes it quite a lot easier to backport to the `pathlib-abc` PyPI package. It was a bit foolish of me to add it to `glob` in the first place. Also add `translate()` to `__all__` in `glob`. This function is new in 3.13, so no NEWS entry is needed.
1 parent 37d0950 commit a40f557

File tree

4 files changed

+314
-309
lines changed

4 files changed

+314
-309
lines changed

Lib/glob.py

+3-303
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,14 @@
22

33
import contextlib
44
import os
5-
import re
65
import fnmatch
7-
import functools
86
import itertools
9-
import operator
107
import stat
118
import sys
129

13-
__all__ = ["glob", "iglob", "escape"]
10+
from pathlib._glob import translate, magic_check, magic_check_bytes
11+
12+
__all__ = ["glob", "iglob", "escape", "translate"]
1413

1514
def glob(pathname, *, root_dir=None, dir_fd=None, recursive=False,
1615
include_hidden=False):
@@ -226,9 +225,6 @@ def _join(dirname, basename):
226225
return dirname or basename
227226
return os.path.join(dirname, basename)
228227

229-
magic_check = re.compile('([*?[])')
230-
magic_check_bytes = re.compile(b'([*?[])')
231-
232228
def has_magic(s):
233229
if isinstance(s, bytes):
234230
match = magic_check_bytes.search(s)
@@ -258,300 +254,4 @@ def escape(pathname):
258254
return drive + pathname
259255

260256

261-
_special_parts = ('', '.', '..')
262257
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
263-
_no_recurse_symlinks = object()
264-
265-
266-
def translate(pat, *, recursive=False, include_hidden=False, seps=None):
267-
"""Translate a pathname with shell wildcards to a regular expression.
268-
269-
If `recursive` is true, the pattern segment '**' will match any number of
270-
path segments.
271-
272-
If `include_hidden` is true, wildcards can match path segments beginning
273-
with a dot ('.').
274-
275-
If a sequence of separator characters is given to `seps`, they will be
276-
used to split the pattern into segments and match path separators. If not
277-
given, os.path.sep and os.path.altsep (where available) are used.
278-
"""
279-
if not seps:
280-
if os.path.altsep:
281-
seps = (os.path.sep, os.path.altsep)
282-
else:
283-
seps = os.path.sep
284-
escaped_seps = ''.join(map(re.escape, seps))
285-
any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
286-
not_sep = f'[^{escaped_seps}]'
287-
if include_hidden:
288-
one_last_segment = f'{not_sep}+'
289-
one_segment = f'{one_last_segment}{any_sep}'
290-
any_segments = f'(?:.+{any_sep})?'
291-
any_last_segments = '.*'
292-
else:
293-
one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
294-
one_segment = f'{one_last_segment}{any_sep}'
295-
any_segments = f'(?:{one_segment})*'
296-
any_last_segments = f'{any_segments}(?:{one_last_segment})?'
297-
298-
results = []
299-
parts = re.split(any_sep, pat)
300-
last_part_idx = len(parts) - 1
301-
for idx, part in enumerate(parts):
302-
if part == '*':
303-
results.append(one_segment if idx < last_part_idx else one_last_segment)
304-
elif recursive and part == '**':
305-
if idx < last_part_idx:
306-
if parts[idx + 1] != '**':
307-
results.append(any_segments)
308-
else:
309-
results.append(any_last_segments)
310-
else:
311-
if part:
312-
if not include_hidden and part[0] in '*?':
313-
results.append(r'(?!\.)')
314-
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
315-
if idx < last_part_idx:
316-
results.append(any_sep)
317-
res = ''.join(results)
318-
return fr'(?s:{res})\Z'
319-
320-
321-
@functools.lru_cache(maxsize=512)
322-
def _compile_pattern(pat, sep, case_sensitive, recursive=True):
323-
"""Compile given glob pattern to a re.Pattern object (observing case
324-
sensitivity)."""
325-
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
326-
regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
327-
return re.compile(regex, flags=flags).match
328-
329-
330-
class _Globber:
331-
"""Class providing shell-style pattern matching and globbing.
332-
"""
333-
334-
def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False):
335-
self.sep = sep
336-
self.case_sensitive = case_sensitive
337-
self.case_pedantic = case_pedantic
338-
self.recursive = recursive
339-
340-
# Low-level methods
341-
342-
lstat = staticmethod(os.lstat)
343-
scandir = staticmethod(os.scandir)
344-
parse_entry = operator.attrgetter('path')
345-
concat_path = operator.add
346-
347-
if os.name == 'nt':
348-
@staticmethod
349-
def add_slash(pathname):
350-
tail = os.path.splitroot(pathname)[2]
351-
if not tail or tail[-1] in '\\/':
352-
return pathname
353-
return f'{pathname}\\'
354-
else:
355-
@staticmethod
356-
def add_slash(pathname):
357-
if not pathname or pathname[-1] == '/':
358-
return pathname
359-
return f'{pathname}/'
360-
361-
# High-level methods
362-
363-
def compile(self, pat):
364-
return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)
365-
366-
def selector(self, parts):
367-
"""Returns a function that selects from a given path, walking and
368-
filtering according to the glob-style pattern parts in *parts*.
369-
"""
370-
if not parts:
371-
return self.select_exists
372-
part = parts.pop()
373-
if self.recursive and part == '**':
374-
selector = self.recursive_selector
375-
elif part in _special_parts:
376-
selector = self.special_selector
377-
elif not self.case_pedantic and magic_check.search(part) is None:
378-
selector = self.literal_selector
379-
else:
380-
selector = self.wildcard_selector
381-
return selector(part, parts)
382-
383-
def special_selector(self, part, parts):
384-
"""Returns a function that selects special children of the given path.
385-
"""
386-
select_next = self.selector(parts)
387-
388-
def select_special(path, exists=False):
389-
path = self.concat_path(self.add_slash(path), part)
390-
return select_next(path, exists)
391-
return select_special
392-
393-
def literal_selector(self, part, parts):
394-
"""Returns a function that selects a literal descendant of a path.
395-
"""
396-
397-
# Optimization: consume and join any subsequent literal parts here,
398-
# rather than leaving them for the next selector. This reduces the
399-
# number of string concatenation operations and calls to add_slash().
400-
while parts and magic_check.search(parts[-1]) is None:
401-
part += self.sep + parts.pop()
402-
403-
select_next = self.selector(parts)
404-
405-
def select_literal(path, exists=False):
406-
path = self.concat_path(self.add_slash(path), part)
407-
return select_next(path, exists=False)
408-
return select_literal
409-
410-
def wildcard_selector(self, part, parts):
411-
"""Returns a function that selects direct children of a given path,
412-
filtering by pattern.
413-
"""
414-
415-
match = None if part == '*' else self.compile(part)
416-
dir_only = bool(parts)
417-
if dir_only:
418-
select_next = self.selector(parts)
419-
420-
def select_wildcard(path, exists=False):
421-
try:
422-
# We must close the scandir() object before proceeding to
423-
# avoid exhausting file descriptors when globbing deep trees.
424-
with self.scandir(path) as scandir_it:
425-
entries = list(scandir_it)
426-
except OSError:
427-
pass
428-
else:
429-
for entry in entries:
430-
if match is None or match(entry.name):
431-
if dir_only:
432-
try:
433-
if not entry.is_dir():
434-
continue
435-
except OSError:
436-
continue
437-
entry_path = self.parse_entry(entry)
438-
if dir_only:
439-
yield from select_next(entry_path, exists=True)
440-
else:
441-
yield entry_path
442-
return select_wildcard
443-
444-
def recursive_selector(self, part, parts):
445-
"""Returns a function that selects a given path and all its children,
446-
recursively, filtering by pattern.
447-
"""
448-
# Optimization: consume following '**' parts, which have no effect.
449-
while parts and parts[-1] == '**':
450-
parts.pop()
451-
452-
# Optimization: consume and join any following non-special parts here,
453-
# rather than leaving them for the next selector. They're used to
454-
# build a regular expression, which we use to filter the results of
455-
# the recursive walk. As a result, non-special pattern segments
456-
# following a '**' wildcard don't require additional filesystem access
457-
# to expand.
458-
follow_symlinks = self.recursive is not _no_recurse_symlinks
459-
if follow_symlinks:
460-
while parts and parts[-1] not in _special_parts:
461-
part += self.sep + parts.pop()
462-
463-
match = None if part == '**' else self.compile(part)
464-
dir_only = bool(parts)
465-
select_next = self.selector(parts)
466-
467-
def select_recursive(path, exists=False):
468-
path = self.add_slash(path)
469-
match_pos = len(str(path))
470-
if match is None or match(str(path), match_pos):
471-
yield from select_next(path, exists)
472-
stack = [path]
473-
while stack:
474-
yield from select_recursive_step(stack, match_pos)
475-
476-
def select_recursive_step(stack, match_pos):
477-
path = stack.pop()
478-
try:
479-
# We must close the scandir() object before proceeding to
480-
# avoid exhausting file descriptors when globbing deep trees.
481-
with self.scandir(path) as scandir_it:
482-
entries = list(scandir_it)
483-
except OSError:
484-
pass
485-
else:
486-
for entry in entries:
487-
is_dir = False
488-
try:
489-
if entry.is_dir(follow_symlinks=follow_symlinks):
490-
is_dir = True
491-
except OSError:
492-
pass
493-
494-
if is_dir or not dir_only:
495-
entry_path = self.parse_entry(entry)
496-
if match is None or match(str(entry_path), match_pos):
497-
if dir_only:
498-
yield from select_next(entry_path, exists=True)
499-
else:
500-
# Optimization: directly yield the path if this is
501-
# last pattern part.
502-
yield entry_path
503-
if is_dir:
504-
stack.append(entry_path)
505-
506-
return select_recursive
507-
508-
def select_exists(self, path, exists=False):
509-
"""Yields the given path, if it exists.
510-
"""
511-
if exists:
512-
# Optimization: this path is already known to exist, e.g. because
513-
# it was returned from os.scandir(), so we skip calling lstat().
514-
yield path
515-
else:
516-
try:
517-
self.lstat(path)
518-
yield path
519-
except OSError:
520-
pass
521-
522-
@classmethod
523-
def walk(cls, root, top_down, on_error, follow_symlinks):
524-
"""Walk the directory tree from the given root, similar to os.walk().
525-
"""
526-
paths = [root]
527-
while paths:
528-
path = paths.pop()
529-
if isinstance(path, tuple):
530-
yield path
531-
continue
532-
try:
533-
with cls.scandir(path) as scandir_it:
534-
dirnames = []
535-
filenames = []
536-
if not top_down:
537-
paths.append((path, dirnames, filenames))
538-
for entry in scandir_it:
539-
name = entry.name
540-
try:
541-
if entry.is_dir(follow_symlinks=follow_symlinks):
542-
if not top_down:
543-
paths.append(cls.parse_entry(entry))
544-
dirnames.append(name)
545-
else:
546-
filenames.append(name)
547-
except OSError:
548-
filenames.append(name)
549-
except OSError as error:
550-
if on_error is not None:
551-
on_error(error)
552-
else:
553-
if top_down:
554-
yield path, dirnames, filenames
555-
if dirnames:
556-
prefix = cls.add_slash(path)
557-
paths += [cls.concat_path(prefix, d) for d in reversed(dirnames)]

Lib/pathlib/__init__.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
operating systems.
66
"""
77

8-
import glob
98
import io
109
import ntpath
1110
import operator
@@ -25,7 +24,7 @@
2524
except ImportError:
2625
grp = None
2726

28-
from . import _abc
27+
from . import _abc, _glob
2928

3029

3130
__all__ = [
@@ -113,7 +112,7 @@ class PurePath(_abc.PurePathBase):
113112
'_hash',
114113
)
115114
parser = os.path
116-
_globber = glob._Globber
115+
_globber = _glob.Globber
117116

118117
def __new__(cls, *args, **kwargs):
119118
"""Construct a PurePath from one or several strings and or existing

Lib/pathlib/_abc.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
"""
1313

1414
import functools
15-
import glob
1615
import operator
1716
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
1817
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO
1918

19+
from . import _glob
20+
2021
#
2122
# Internals
2223
#
@@ -43,7 +44,7 @@ def _is_case_sensitive(parser):
4344
return parser.normcase('Aa') == 'Aa'
4445

4546

46-
class Globber(glob._Globber):
47+
class Globber(_glob.Globber):
4748
lstat = operator.methodcaller('lstat')
4849
add_slash = operator.methodcaller('joinpath', '')
4950

@@ -692,7 +693,7 @@ def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
692693
# know the case sensitivity of the underlying filesystem, so we
693694
# must use scandir() for everything, including non-wildcard parts.
694695
case_pedantic = True
695-
recursive = True if recurse_symlinks else glob._no_recurse_symlinks
696+
recursive = True if recurse_symlinks else _glob.no_recurse_symlinks
696697
globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive)
697698
return globber.selector(parts)
698699

0 commit comments

Comments
 (0)