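"""Scan every English page of a wiki (by default the Team Fortress Wiki) for
external links, check each link over HTTP, and build a wiki-formatted report of
dead and suspicious links. See main() for the overall pipeline."""
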
from queue import Queue, Empty
from re import compile, DOTALL
from threading import Thread, Event
from wikitools import wiki
from wikitools.page import Page
import requests

verbose = False

LANGS = ['ar', 'cs', 'da', 'de', 'es', 'fi', 'fr', 'hu', 'it', 'ja', 'ko', 'nl', 'no', 'pl', 'pt', 'pt-br', 'ro', 'ru', 'sv', 'tr', 'zh-hans', 'zh-hant']
PAGESCRAPERS = 10  # Number of page-scraper threads
LINKCHECKERS = 50  # Number of link-checker threads


# Shamelessly copied from the old external_links_analyse.
def return_link_regex(withoutBracketed=False, onlyBracketed=False):
    """Return a compiled regex that matches external links."""
    # RFC 2396 says that URLs may only contain certain characters.
    # For this regex we also accept non-allowed characters, so that the bot
    # will later show these links as broken ('Non-ASCII Characters in URL').
    # Note: While allowing dots inside URLs, MediaWiki will regard
    # dots at the end of the URL as not part of that URL.
    # The same applies to comma, colon and some other characters.
    notAtEnd = r'\]\s\.:;,<>"\|)'
    # So characters inside the URL can be anything except whitespace,
    # closing squared brackets, quotation marks, greater than and less
    # than, and the last character also can't be parenthesis or another
    # character disallowed by MediaWiki.
    notInside = r'\]\s<>"'
    # The first half of this regular expression is required because '' is
    # not allowed inside links. For example, in this wiki text:
    #   ''Please see http://www.example.org.''
    # .'' shouldn't be considered as part of the link.
    regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
        + '](?=[' + notAtEnd + ']*\'\')|http[s]?://[^' + notInside \
        + ']*[^' + notAtEnd + '])'
    if withoutBracketed:
        regex = r'(?<!\[)' + regex
    elif onlyBracketed:
        regex = r'\[' + regex
    return compile(regex)
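
# Illustrative example: given the wiki text "''Please see http://www.example.org.''",
# the regex returned above matches 'http://www.example.org' - the trailing dot and
# the closing '' are excluded.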


# Also shamelessly copied from the old external_links_analyse.
def get_links(regex, text):
    """Yield every external link URL that the given regex finds in the wiki text."""
    # First, flatten nested templates so the inner braces don't confuse the regex below.
    nestedTemplateR = compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    while nestedTemplateR.search(text):
        text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)
    # Then blow up the templates with spaces so that the | and }} will not be
    # regarded as part of the link.
    templateWithParamsR = compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}', DOTALL)
    while templateWithParamsR.search(text):
        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
    for m in regex.finditer(text):
        yield m.group('url')
# End of stuff I shamelessly copied.
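
# Illustrative example: driving get_links() with the regex from return_link_regex(),
#   list(get_links(return_link_regex(), 'See [https://example.org docs] and {{cite|url=http://foo.bar}}'))
# yields ['https://example.org', 'http://foo.bar']; brackets, template braces and
# the | separator are not treated as part of the URLs.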


def pagescraper(w, page_q, done, link_q, links):
    """Worker: pull pages off page_q and record the external links they contain.

    links maps each URL to the list of page titles it appears on; every URL is
    also queued on link_q exactly once so a linkchecker can test it.
    """
    linkRegex = return_link_regex()  # The regex never changes, so compile it once per worker
    while True:
        try:
            page = page_q.get(True, 1)
        except Empty:
            if done.is_set():  # Page generation is finished and the queue is drained
                return
            else:
                continue
        content = page.get_wiki_text()
        for url in get_links(linkRegex, content):
            if url not in links:
                links[url] = []
                link_q.put(url)
            if page.title not in links[url]:
                links[url].append(page.title)


def linkchecker(link_q, done, linkData):
    """Worker: pull URLs off link_q, fetch them, and record any failures in linkData."""
    while True:
        try:
            link = link_q.get(True, 1)
        except Empty:
            if done.is_set():  # No more links are coming and the queue is drained
                return
            else:
                continue
        try:
            r = requests.get(link, timeout=20)
            r.raise_for_status()
            continue  # No error
        except requests.exceptions.ConnectionError:
            linkData.append(('Not found', link))
        except requests.exceptions.Timeout:
            linkData.append(('Timeout', link))
        except requests.exceptions.TooManyRedirects:
            linkData.append(('Redirect loop', link))
        except requests.exceptions.HTTPError as e:
            code = e.response.status_code
            if code in (301, 302, 303):
                linkData.append(('Redirect loop', link))
            else:
                linkData.append((e.response.reason, link))
        if verbose:
            print(f'Found an error for {link}')


def main(w):
    threads = []
    # Stage 0: Generate the list of pages to scan (English pages only).
    if verbose:
        print('Generating page list')
    page_q, done = Queue(), Event()
    for page in w.get_all_pages():
        if page.lang != 'en':
            continue
        page_q.put(page)
    done.set()
    if verbose:
        print('All pages generated, entering stage 1')
    # Stage 1: All pages generated. Page scrapers are allowed to exit once the page queue is empty.
    links = {}
    link_q = Queue()
    for _ in range(PAGESCRAPERS):
        thread = Thread(target=pagescraper, args=(w, page_q, done, link_q, links))
        threads.append(thread)
        thread.start()
    if verbose:
        print('Page scrapers started, entering stage 2')
    # Stage 2: Page scrapers running. Link checkers are allowed to exit once the link queue is empty.
    _linkData = []
    for _ in range(LINKCHECKERS):
        thread = Thread(target=linkchecker, args=(link_q, done, _linkData))
        threads.append(thread)
        thread.start()
    if verbose:
        print('Waiting for all threads to finish')
    for thread in threads:
        thread.join()
    if verbose:
        print('Done scraping links, generating output')
    output = '== Dead or incorrectly behaving links ==\n'
    linkData = sorted(_linkData)
    for error, link in linkData:
        output += f'* {link} ({error})\n'
        for page in sorted(links[link]):
            output += f'** [[:{page}]]\n'
    output += '== Suspicious links ==\n'
    for link in links:
        suspicious = False
        for domain in ['wiki.tf2.com', 'wiki.teamfortress.com', 'wiki.tf', 'pastie', 'paste']:
            if domain in link:
                suspicious = True
                break
        if suspicious:
            output += f'* {link}\n'
            for page in sorted(links[link]):
                output += f'** [[:{page}]]\n'
    output = output.replace('tumblr', 'tumb1r')  # Link blacklist
    output = output.replace('amazon', 'amaz0n')  # Link blacklist
    return output
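
# Illustrative output shape (URLs and page names are made up):
#   == Dead or incorrectly behaving links ==
#   * http://example.org/missing (Not Found)
#   ** [[:Some page]]
#   == Suspicious links ==
#   * http://pastie.example/abc123
#   ** [[:Another page]]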


if __name__ == '__main__':
    verbose = True
    w = wiki.Wiki('https://wiki.teamfortress.com/w/api.php')
    with open('wiki_external_links.txt', 'w') as f:
        f.write(main(w))
        print(f'Article written to {f.name}')