forked from mherrmann/gitignore_parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gitignore_parser.py
220 lines (201 loc) · 7.41 KB
/
gitignore_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import collections
import os
import re
from os.path import abspath, dirname
from pathlib import Path
from typing import Reversible, Union
def handle_negation(file_path, rules: Reversible["IgnoreRule"]):
    """Decide whether *file_path* is ignored when negation rules are present.

    Rules are consulted from last to first, so a later rule overrides an
    earlier one. The first rule (in that reversed order) whose ``match``
    accepts the path decides the outcome: the path is ignored unless that
    rule is a negation. A path matched by no rule is not ignored.
    """
    deciding_rule = next(
        (candidate for candidate in reversed(rules)
         if candidate.match(file_path)),
        None,
    )
    if deciding_rule is None:
        return False
    return not deciding_rule.negation
def parse_gitignore(full_path, base_dir=None):
    """Read an ignore file and return a predicate built from its rules.

    Args:
        full_path: Path of the ignore file to open and parse.
        base_dir: Directory the patterns are interpreted against. Defaults
            to the directory containing ``full_path``.

    Returns:
        A callable taking a file path and returning a truthy value when one
        of the parsed rules says the path is ignored.
    """
    if base_dir is None:
        base_dir = dirname(full_path)
    # The base directory is identical for every line, so resolve it once
    # instead of hitting the filesystem again for each pattern.
    base_path = Path(base_dir).resolve()

    rules = []
    with open(full_path) as ignore_file:
        # Line numbers are 1-based and stored with each rule for reporting.
        for line_number, line in enumerate(ignore_file, start=1):
            rule = rule_from_pattern(
                line.rstrip('\n'),
                base_path=base_path,
                source=(full_path, line_number),
            )
            if rule:
                rules.append(rule)

    if not any(r.negation for r in rules):
        # Without negations the order of the rules is irrelevant.
        return lambda file_path: any(r.match(file_path) for r in rules)
    # We have negation rules. We can't use a simple "any" to evaluate them.
    # Later rules override earlier rules.
    return lambda file_path: handle_negation(file_path, rules)
def _strip_trailing_spaces(pattern):
    r"""Drop unescaped trailing spaces; turn a trailing ``\ `` into a space.

    Scans from the end of the pattern. Once an escaped space is found, its
    backslash is removed and nothing further to its left is stripped.
    """
    i = len(pattern) - 1
    keep_stripping = True
    while i >= 0 and pattern[i] == ' ':
        if i >= 1 and pattern[i - 1] == '\\':
            # Escaped space: remove the backslash, keep the space.
            pattern = pattern[:i - 1] + pattern[i:]
            keep_stripping = False
        elif keep_stripping:
            pattern = pattern[:i]
        i -= 1
    return pattern


def rule_from_pattern(pattern, base_path=None, source=None):
    """
    Take a .gitignore match pattern, such as "*.py[cod]" or "**/*.bak",
    and return an IgnoreRule suitable for matching against files and
    directories. Patterns which do not match files, such as comments
    and blank lines, will return None.

    Because git allows for nested .gitignore files, a base_path value
    is required for correct behavior. The base path should be absolute.

    Args:
        pattern: One line of an ignore file, without its line terminator.
        base_path: Directory the pattern is relative to (``str`` or
            ``Path``). It must be equal to its own ``Path.resolve()``.
        source: Opaque value stored on the rule; callers in this file pass
            a ``(file, line_number)`` tuple.

    Returns:
        An IgnoreRule, or None when the line is blank, a comment, or is
        reduced to nothing (for example "/", "!" or "//").

    Raises:
        ValueError: If ``base_path`` differs from its resolved form.
    """
    # Compare Path to Path: a plain str never equals a Path object, which
    # would reject every string base_path.
    if base_path and Path(base_path) != Path(base_path).resolve():
        raise ValueError('base_path must be absolute')
    # Store the exact pattern for our repr and string functions
    orig_pattern = pattern

    # Discard comments and separators
    if pattern.strip() == '' or pattern[0] == '#':
        return None

    # Strip leading bang before examining double asterisks
    negation = pattern[0] == '!'
    if negation:
        pattern = pattern[1:]

    # Trailing spaces are ignored unless they are escaped with a backslash.
    # This happens before the slash checks so "build/ " behaves like "build/".
    pattern = _strip_trailing_spaces(pattern)

    # Multi-asterisks not surrounded by slashes (or at the start/end) should
    # be treated like single-asterisks.
    pattern = re.sub(r'([^/])\*{2,}', r'\1*', pattern)
    pattern = re.sub(r'\*{2,}([^/])', r'*\1', pattern)

    # Nothing left (e.g. a lone "!"), or the special case '/', which doesn't
    # match any files or directories.
    if pattern in ('', '/'):
        return None

    directory_only = pattern.endswith('/')
    # A slash is a sign that we're tied to the base_path of our rule
    # set.
    anchored = '/' in pattern[:-1]
    if pattern.startswith('/'):
        pattern = pattern[1:]
    had_leading_globstar = pattern.startswith('**')
    if had_leading_globstar:
        pattern = pattern[2:]
        anchored = False
    if pattern.startswith('/'):
        pattern = pattern[1:]
    if pattern.endswith('/'):
        pattern = pattern[:-1]

    if not pattern:
        if not had_leading_globstar:
            # Only slashes were given (e.g. "//"): nothing to match.
            return None
        # "**", "/**" and "**/" reduce to an empty remainder; an unanchored
        # "*" expresses "any name at any depth".
        pattern = '*'

    # patterns with leading hashes or exclamation marks are escaped with a
    # backslash in front, unescape it
    if pattern.startswith(('\\#', '\\!')):
        pattern = pattern[1:]

    regex = fnmatch_pathname_to_regex(
        pattern, directory_only, negation, anchored=bool(anchored)
    )
    return IgnoreRule(
        pattern=orig_pattern,
        regex=regex,
        negation=negation,
        directory_only=directory_only,
        anchored=anchored,
        base_path=_normalize_path(base_path) if base_path else None,
        source=source
    )
# Field names, in positional order, of the namedtuple that IgnoreRule extends.
IGNORE_RULE_FIELDS = [
    # Basic values
    'pattern',
    'regex',
    # Behavior flags
    'negation',
    'directory_only',
    'anchored',
    # Meaningful for gitignore-style behavior
    'base_path',
    # (file, line) tuple for reporting
    'source',
]
class IgnoreRule(collections.namedtuple('IgnoreRule_', IGNORE_RULE_FIELDS)):
    """One ignore pattern plus the regex and flags used to evaluate it.

    The fields are those named in IGNORE_RULE_FIELDS. ``str()`` yields the
    stored ``pattern`` field; ``repr()`` wraps it as ``IgnoreRule('...')``.
    """

    def __str__(self):
        return self.pattern

    def __repr__(self):
        return f"IgnoreRule('{self.pattern}')"

    def match(self, abs_path: Union[str, Path]):
        """Return True when ``abs_path`` is matched by this rule's regex.

        The path is normalized and, when the rule has a ``base_path``, made
        relative to it before the regex is searched.

        Raises:
            ValueError: From ``Path.relative_to`` when the normalized path
                does not lie under ``base_path``.
        """
        normalized = _normalize_path(abs_path)
        if self.base_path:
            rel_path = str(normalized.relative_to(self.base_path))
        else:
            rel_path = str(normalized)
        # Path() strips the trailing slash, so we need to preserve it
        # in case of directory-only negation
        if (
            self.negation
            and isinstance(abs_path, str)
            and abs_path.endswith('/')  # safe for '' unlike abs_path[-1]
        ):
            rel_path += '/'
        if rel_path.startswith('./'):
            rel_path = rel_path[2:]
        return bool(re.search(self.regex, rel_path))
# Frustratingly, python's fnmatch doesn't provide the FNM_PATHNAME
# option that .gitignore's behavior depends on.
def fnmatch_pathname_to_regex(
    pattern, directory_only: bool, negation: bool, anchored: bool = False
):
    """
    Implements fnmatch style-behavior, as though with FNM_PATHNAME flagged;
    the path separator will not match shell-style '*', '?' and '[...]'
    wildcards.

    Args:
        pattern: Glob text with leading/trailing slashes already removed.
        directory_only: Whether the original pattern ended with a slash.
        negation: Whether the original pattern started with '!'.
        anchored: When true the regex must match from the start of the
            path; otherwise it may start after any path separator.

    Returns:
        A regular-expression string intended for ``re.search``.
    """
    i, n = 0, len(pattern)
    seps = [re.escape(os.sep)]
    if os.altsep is not None:
        seps.append(re.escape(os.altsep))
    # Members of a character class are listed back to back; joining them
    # with '|' would add a literal pipe to the set of separators.
    sep_chars = ''.join(seps)
    seps_group = '[' + sep_chars + ']'
    nonsep = '[^' + sep_chars + ']'
    # Guard placed before bracket expressions so they never consume a
    # separator, even when negated or when a range happens to span one.
    not_sep_ahead = '(?!' + seps_group + ')'

    res = []
    while i < n:
        c = pattern[i]
        i += 1
        if c == '*':
            if i < n and pattern[i] == '*':
                i += 1
                if i < n and pattern[i] == '/':
                    # '**/' spans zero or more whole directories.
                    i += 1
                    res.append('(.*' + seps_group + ')?')
                else:
                    res.append('.*')
            else:
                res.append(nonsep + '*')
        elif c == '?':
            res.append(nonsep)
        elif c == '/':
            res.append(seps_group)
        elif c == '[':
            # Locate the closing bracket; a ']' directly after '[' or '[!'
            # is a literal member, not the terminator.
            j = i
            if j < n and pattern[j] == '!':
                j += 1
            if j < n and pattern[j] == ']':
                j += 1
            while j < n and pattern[j] != ']':
                j += 1
            if j >= n:
                # Unterminated bracket: treat '[' literally.
                res.append('\\[')
            else:
                stuff = pattern[i:j].replace('\\', '\\\\').replace('/', '')
                i = j + 1
                negated = stuff.startswith('!')
                if negated:
                    stuff = stuff[1:]
                elif stuff.startswith('^'):
                    # A leading '^' is a literal member here, not a negation.
                    stuff = '\\' + stuff
                if stuff:
                    res.append(''.join([
                        not_sep_ahead, '[', '^' if negated else '', stuff, ']'
                    ]))
                elif negated:
                    # e.g. '[!/]': everything except a separator.
                    res.append(nonsep)
                else:
                    # e.g. '[/]': only slashes were listed, so nothing can
                    # match; emit a pattern that always fails.
                    res.append('(?!)')
        else:
            res.append(re.escape(c))

    if anchored:
        res.insert(0, '^')
    else:
        res.insert(0, f"(^|{seps_group})")
    if not directory_only:
        res.append('$')
    elif directory_only and negation:
        res.append('/$')
    else:
        res.append('($|\\/)')
    return ''.join(res)
def _normalize_path(path: Union[str, Path]) -> Path:
"""Normalize a path without resolving symlinks.
This is equivalent to `Path.resolve()` except that it does not resolve symlinks.
Note that this simplifies paths by removing double slashes, `..`, `.` etc. like
`Path.resolve()` does.
"""
return Path(abspath(path))