@@ -78,33 +78,12 @@ def _is_case_sensitive(flavour):
78
78
}
79
79
80
80
81
- @functools .lru_cache ()
82
- def _make_selector (pattern_parts , flavour , case_sensitive ):
83
- pat = pattern_parts [0 ]
84
- if not pat :
85
- return _TerminatingSelector ()
86
- if pat == '**' :
87
- child_parts_idx = 1
88
- while child_parts_idx < len (pattern_parts ) and pattern_parts [child_parts_idx ] == '**' :
89
- child_parts_idx += 1
90
- child_parts = pattern_parts [child_parts_idx :]
91
- if '**' in child_parts :
92
- cls = _DoubleRecursiveWildcardSelector
93
- else :
94
- cls = _RecursiveWildcardSelector
95
- else :
96
- child_parts = pattern_parts [1 :]
97
- if pat == '..' :
98
- cls = _ParentSelector
99
- elif '**' in pat :
100
- raise ValueError ("Invalid pattern: '**' can only be an entire path component" )
101
- else :
102
- cls = _WildcardSelector
103
- return cls (pat , child_parts , flavour , case_sensitive )
104
-
105
-
106
81
@functools .lru_cache (maxsize = 256 )
107
82
def _compile_pattern (pat , case_sensitive ):
83
+ """Compile given glob pattern to a re.Pattern object (observing case
84
+ sensitivity), or None if the pattern should match everything."""
85
+ if pat == '*' :
86
+ return None
108
87
flags = re .NOFLAG if case_sensitive else re .IGNORECASE
109
88
return re .compile (fnmatch .translate (pat ), flags ).match
110
89
@@ -127,7 +106,11 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
127
106
# Match the start of the path, or just after a path separator
128
107
parts = ['^' ]
129
108
for part in pattern_lines .splitlines (keepends = True ):
130
- if part == '**\n ' :
109
+ if part == '*\n ' :
110
+ part = r'.+\n'
111
+ elif part == '*' :
112
+ part = r'.+'
113
+ elif part == '**\n ' :
131
114
# '**/' component: we use '[\s\S]' rather than '.' so that path
132
115
# separators (i.e. newlines) are matched. The trailing '^' ensures
133
116
# we terminate after a path separator (i.e. on a new line).
@@ -154,114 +137,70 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
154
137
return re .compile ('' .join (parts ), flags = flags )
155
138
156
139
157
- class _Selector :
158
- """A selector matches a specific glob pattern part against the children
159
- of a given path."""
160
-
161
- def __init__ (self , child_parts , flavour , case_sensitive ):
162
- self .child_parts = child_parts
163
- if child_parts :
164
- self .successor = _make_selector (child_parts , flavour , case_sensitive )
165
- self .dironly = True
166
- else :
167
- self .successor = _TerminatingSelector ()
168
- self .dironly = False
169
-
170
- def select_from (self , parent_path , follow_symlinks ):
171
- """Iterate over all child paths of `parent_path` matched by this
172
- selector. This can contain parent_path itself."""
173
- path_cls = type (parent_path )
174
- scandir = path_cls ._scandir
175
- if not parent_path .is_dir ():
176
- return iter ([])
177
- return self ._select_from (parent_path , scandir , follow_symlinks )
178
-
179
-
180
- class _TerminatingSelector :
181
-
182
- def _select_from (self , parent_path , scandir , follow_symlinks ):
183
- yield parent_path
184
-
185
-
186
- class _ParentSelector (_Selector ):
187
-
188
- def __init__ (self , name , child_parts , flavour , case_sensitive ):
189
- _Selector .__init__ (self , child_parts , flavour , case_sensitive )
190
-
191
- def _select_from (self , parent_path , scandir , follow_symlinks ):
192
- path = parent_path ._make_child_relpath ('..' )
193
- for p in self .successor ._select_from (path , scandir , follow_symlinks ):
194
- yield p
195
-
196
-
197
- class _WildcardSelector (_Selector ):
198
-
199
- def __init__ (self , pat , child_parts , flavour , case_sensitive ):
200
- _Selector .__init__ (self , child_parts , flavour , case_sensitive )
201
- if case_sensitive is None :
202
- # TODO: evaluate case-sensitivity of each directory in _select_from()
203
- case_sensitive = _is_case_sensitive (flavour )
204
- self .match = _compile_pattern (pat , case_sensitive )
205
-
206
- def _select_from (self , parent_path , scandir , follow_symlinks ):
207
- follow_dirlinks = True if follow_symlinks is None else follow_symlinks
140
+ def _select_children (parent_paths , dir_only , follow_symlinks , match ):
141
+ """Yield direct children of given paths, filtering by name and type."""
142
+ if follow_symlinks is None :
143
+ follow_symlinks = True
144
+ for parent_path in parent_paths :
208
145
try :
209
146
# We must close the scandir() object before proceeding to
210
147
# avoid exhausting file descriptors when globbing deep trees.
211
- with scandir ( parent_path ) as scandir_it :
148
+ with parent_path . _scandir ( ) as scandir_it :
212
149
entries = list (scandir_it )
213
150
except OSError :
214
151
pass
215
152
else :
216
153
for entry in entries :
217
- if self . dironly :
154
+ if dir_only :
218
155
try :
219
- if not entry .is_dir (follow_symlinks = follow_dirlinks ):
156
+ if not entry .is_dir (follow_symlinks = follow_symlinks ):
220
157
continue
221
158
except OSError :
222
159
continue
223
160
name = entry .name
224
- if self .match (name ):
225
- path = parent_path ._make_child_relpath (name )
226
- for p in self .successor ._select_from (path , scandir , follow_symlinks ):
227
- yield p
228
-
161
+ if match is None or match (name ):
162
+ yield parent_path ._make_child_relpath (name )
229
163
230
- class _RecursiveWildcardSelector (_Selector ):
231
-
232
- def __init__ (self , pat , child_parts , flavour , case_sensitive ):
233
- _Selector .__init__ (self , child_parts , flavour , case_sensitive )
234
-
235
- def _iterate_directories (self , parent_path , follow_symlinks ):
236
- yield parent_path
237
- for dirpath , dirnames , _ in parent_path .walk (follow_symlinks = follow_symlinks ):
238
- for dirname in dirnames :
239
- yield dirpath ._make_child_relpath (dirname )
240
-
241
- def _select_from (self , parent_path , scandir , follow_symlinks ):
242
- follow_dirlinks = False if follow_symlinks is None else follow_symlinks
243
- successor_select = self .successor ._select_from
244
- for starting_point in self ._iterate_directories (parent_path , follow_dirlinks ):
245
- for p in successor_select (starting_point , scandir , follow_symlinks ):
246
- yield p
247
-
248
-
249
- class _DoubleRecursiveWildcardSelector (_RecursiveWildcardSelector ):
250
- """
251
- Like _RecursiveWildcardSelector, but also de-duplicates results from
252
- successive selectors. This is necessary if the pattern contains
253
- multiple non-adjacent '**' segments.
254
- """
255
164
256
- def _select_from (self , parent_path , scandir , follow_symlinks ):
257
- yielded = set ()
258
- try :
259
- for p in super ()._select_from (parent_path , scandir , follow_symlinks ):
260
- if p not in yielded :
261
- yield p
262
- yielded .add (p )
263
- finally :
264
- yielded .clear ()
165
+ def _select_recursive (parent_paths , dir_only , follow_symlinks ):
166
+ """Yield given paths and all their subdirectories, recursively."""
167
+ if follow_symlinks is None :
168
+ follow_symlinks = False
169
+ for parent_path in parent_paths :
170
+ paths = [parent_path ]
171
+ while paths :
172
+ path = paths .pop ()
173
+ yield path
174
+ try :
175
+ # We must close the scandir() object before proceeding to
176
+ # avoid exhausting file descriptors when globbing deep trees.
177
+ with path ._scandir () as scandir_it :
178
+ entries = list (scandir_it )
179
+ except OSError :
180
+ pass
181
+ else :
182
+ for entry in entries :
183
+ try :
184
+ if entry .is_dir (follow_symlinks = follow_symlinks ):
185
+ paths .append (path ._make_child_relpath (entry .name ))
186
+ continue
187
+ except OSError :
188
+ pass
189
+ if not dir_only :
190
+ yield path ._make_child_relpath (entry .name )
191
+
192
+
193
+ def _select_unique (paths ):
194
+ """Yields the given paths, filtering out duplicates."""
195
+ yielded = set ()
196
+ try :
197
+ for path in paths :
198
+ raw_path = path ._raw_path
199
+ if raw_path not in yielded :
200
+ yield path
201
+ yielded .add (raw_path )
202
+ finally :
203
+ yielded .clear ()
265
204
266
205
267
206
#
@@ -1056,51 +995,109 @@ def _scandir(self):
1056
995
return os .scandir (self )
1057
996
1058
997
def _make_child_relpath (self , name ):
998
+ sep = self ._flavour .sep
999
+ lines_name = name .replace ('\n ' , sep )
1000
+ lines_str = self ._lines
1059
1001
path_str = str (self )
1060
1002
tail = self ._tail
1061
1003
if tail :
1062
- path_str = f'{ path_str } { self ._flavour .sep } { name } '
1004
+ path_str = f'{ path_str } { sep } { name } '
1005
+ lines_str = f'{ lines_str } \n { lines_name } '
1063
1006
elif path_str != '.' :
1064
1007
path_str = f'{ path_str } { name } '
1008
+ lines_str = f'{ lines_str } { lines_name } '
1065
1009
else :
1066
1010
path_str = name
1011
+ lines_str = lines_name
1067
1012
path = self .with_segments (path_str )
1068
1013
path ._str = path_str
1069
1014
path ._drv = self .drive
1070
1015
path ._root = self .root
1071
1016
path ._tail_cached = tail + [name ]
1017
+ path ._lines_cached = lines_str
1072
1018
return path
1073
1019
1074
1020
def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
    """Iterate over this subtree and yield all existing files (of any
    kind, including directories) matching the given relative pattern.

    The "pathlib.Path.glob" audit event is raised when this method is
    called; the matching itself is delegated to _glob().
    """
    sys.audit("pathlib.Path.glob", self, pattern)
    matches = self._glob(pattern, case_sensitive, follow_symlinks)
    return matches
1089
1026
1090
1027
def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
    """Recursively yield all existing files (of any kind, including
    directories) matching the given relative pattern, anywhere in
    this subtree.

    The "pathlib.Path.rglob" audit event is raised with the pattern as
    given; the pattern is then prefixed with '**/' and handed to _glob().
    """
    sys.audit("pathlib.Path.rglob", self, pattern)
    recursive_pattern = f'**/{pattern}'
    return self._glob(recursive_pattern, case_sensitive, follow_symlinks)
1034
+
1035
def _glob(self, pattern, case_sensitive, follow_symlinks):
    """Shared implementation behind glob() and rglob().

    The pattern is parsed into components, and an iterator of candidate
    paths (starting with this path, if it is a directory) is wrapped in
    one selector per component.  The returned iterator is lazy, but this
    is a plain function rather than a generator, so the pattern checks
    below run when it is called.

    Raises NotImplementedError if the pattern has a drive or root, and
    ValueError if it has no components.  ValueError is also raised when a
    component that is processed contains '**' alongside other characters.
    """
    # Parse the pattern with the same machinery as ordinary paths.
    path_pattern = self.with_segments(pattern)
    if path_pattern.drive or path_pattern.root:
        raise NotImplementedError("Non-relative patterns are unsupported")
    elif not path_pattern._tail:
        raise ValueError("Unacceptable pattern: {!r}".format(pattern))

    # An empty string in pattern_parts stands for a trailing slash.
    pattern_parts = list(path_pattern._tail)
    if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
        # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
        pattern_parts.append('')
    if pattern_parts[-1] == '**':
        # GH-70303: '**' only matches directories. Add trailing slash.
        pattern_parts.append('')

    if case_sensitive is None:
        # TODO: evaluate case-sensitivity of each directory in _select_children().
        case_sensitive = _is_case_sensitive(self._flavour)

    # If symlinks are handled consistently, and the pattern does not
    # contain '..' components, then we can use a 'walk-and-match' strategy
    # when expanding '**' wildcards. When a '**' wildcard is encountered,
    # all following pattern parts are immediately consumed and used to
    # build a `re.Pattern` object. This pattern is used to filter the
    # recursive walk. As a result, pattern parts following a '**' wildcard
    # do not perform any filesystem access, which can be much faster!
    #
    # follow_symlinks=None is the inconsistent case: _select_children()
    # treats None as True while _select_recursive() treats it as False.
    filter_paths = follow_symlinks is not None and '..' not in pattern_parts
    deduplicate_paths = False
    # Start from this path only if it is a directory; otherwise there is
    # nothing to search and every selector below sees an empty iterator.
    paths = iter([self] if self.is_dir() else [])
    part_idx = 0
    while part_idx < len(pattern_parts):
        part = pattern_parts[part_idx]
        # After this increment, part_idx points at the *next* part, so
        # 'part_idx < len(pattern_parts)' means "more parts follow".
        part_idx += 1
        if part == '':
            # Trailing slash.
            pass
        elif part == '..':
            paths = (path._make_child_relpath('..') for path in paths)
        elif part == '**':
            # Consume adjacent '**' components.
            while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
                part_idx += 1

            if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
                # Walk-and-match: only directories are wanted when the
                # pattern ends with a slash.
                dir_only = pattern_parts[-1] == ''
                paths = _select_recursive(paths, dir_only, follow_symlinks)

                # Filter out paths that don't match pattern.
                # prefix_len is the length of this path's own prefix in a
                # child's '_lines' string: build a throwaway child named
                # '_' and subtract the one character of its name.
                prefix_len = len(self._make_child_relpath('_')._lines) - 1
                match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
                paths = (path for path in paths if match(path._lines[prefix_len:]))
                # The regex covers every remaining part, so stop here.
                return paths

            # Otherwise expand '**' by walking; restrict to directories
            # when further parts follow.
            dir_only = part_idx < len(pattern_parts)
            paths = _select_recursive(paths, dir_only, follow_symlinks)
            if deduplicate_paths:
                # De-duplicate if we've already seen a '**' component.
                paths = _select_unique(paths)
            deduplicate_paths = True
        elif '**' in part:
            raise ValueError("Invalid pattern: '**' can only be an entire path component")
        else:
            # Ordinary component: match it against the names of direct
            # children, keeping only directories if more parts follow.
            dir_only = part_idx < len(pattern_parts)
            match = _compile_pattern(part, case_sensitive)
            paths = _select_children(paths, dir_only, follow_symlinks, match)
    return paths
1104
1101
1105
1102
def walk (self , top_down = True , on_error = None , follow_symlinks = False ):
1106
1103
"""Walk the directory tree from this directory, similar to os.walk()."""
0 commit comments