-
Notifications
You must be signed in to change notification settings - Fork 115
/
Copy pathcrawler.py
executable file
·162 lines (131 loc) · 4.92 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3
# Built out from https://www.scrapingbee.com/blog/crawling-python/
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Root logger setup: timestamped, level-tagged lines, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)
class Crawler:
    """Breadth-first link checker for a locally served copy of a website.

    Starting from the seed URLs, every internal HTML page is fetched and
    its ``<a href>`` links are collected.  Internal links are queued for
    crawling; external links are only recorded, and can be verified
    afterwards with :meth:`check_all_externals`.

    Result lists (all hold URL strings):
        urls_to_visit   FIFO queue of internal URLs still to crawl.
        visited_urls    Internal URLs taken off the queue, in crawl order.
        nonhtml_urls    Internal URLs whose Content-Type is not text/html.
        bad_urls        Internal URLs whose crawl raised an exception.
        external_urls   Links that point outside ``base_url``.
        bad_externals   External URLs that failed check_external_link().
    """

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    def __init__(self, urls=None):
        """Create a crawler seeded with *urls* (an iterable of URL strings)."""
        # Copy the seeds: a shared default list, or the caller's own list,
        # must not be mutated as the queue is consumed and extended.
        self.urls_to_visit = list(urls) if urls is not None else []
        self.visited_urls = []
        self.nonhtml_urls = []
        self.bad_urls = []
        self.external_urls = []
        # Defined up front so it exists even if the external check never
        # runs or is interrupted.
        self.bad_externals = []

        self.base_url = "http://localhost/"
        # Public addresses of the same site; links to these are rewritten
        # onto base_url by map_equiv().
        self.internal_equiv = [
            "http://shallowsky.com",
            "https://shallowsky.com",
            "http://www.shallowsky.com",
            "https://www.shallowsky.com",
        ]

    def get_linked_urls(self, url, html):
        """Yield the href of every ``<a>`` element found in *html*.

        Root-relative hrefs (starting with ``/``) are resolved against
        *url*; everything else is yielded as written.  Anchors without an
        href (e.g. ``<a name=...>``) are skipped, since they would resolve
        back to the current page.
        """
        # TODO: do the same for img URLs, but don't recurse on them.
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                continue
            if path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def is_internal(self, url):
        """Return True if *url* is relative or lives under ``base_url``.

        The equivalent public domains are not tested here; callers are
        expected to pass the URL through map_equiv() first.
        """
        if ':' not in url:
            # No scheme separator: treat it as a relative, internal link.
            return True
        return url.startswith(self.base_url)

    def map_equiv(self, url):
        """Rewrite a URL on an equivalent domain onto ``base_url``.

        URLs that do not start with one of ``internal_equiv`` are returned
        unchanged.
        """
        for equiv in self.internal_equiv:
            if not url.startswith(equiv):
                continue
            rest = url[len(equiv):]
            # Require a real host boundary so that e.g.
            # "http://shallowsky.community/" is not mistaken for the site.
            if rest and rest[0] not in '/?#':
                continue
            if rest.startswith('/'):
                rest = rest[1:]
            # base_url ends with "/"; join with exactly one slash so the
            # result is not "http://localhost//path".
            return self.base_url.rstrip('/') + '/' + rest
        return url

    def add_url_to_visit(self, url):
        """Record *url*: queue it if internal and new, else note it as external."""
        if not self.is_internal(url):
            if url not in self.external_urls:
                self.external_urls.append(url)
            return

        # Strip off any named anchor; it refers to the same document.
        url = url.partition('#')[0]

        already_known = (
            url in self.visited_urls
            or url in self.urls_to_visit
            or url in self.nonhtml_urls
            or url in self.bad_urls
        )
        if not already_known:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        """Fetch *url* and register every link found on the page.

        Non-HTML resources are added to ``nonhtml_urls`` and not parsed.

        Raises:
            requests.RequestException: on connection problems, timeouts,
                or an HTTP 4xx/5xx status, so that run() records the URL
                as bad instead of parsing an error page.
        """
        # Check MIME type first; don't download or parse non-HTML files.
        head = requests.head(url, timeout=self.REQUEST_TIMEOUT)
        head.raise_for_status()

        content_type = head.headers.get('Content-Type')
        if content_type is None:
            print(url, ": No Content-Type! headers:", head.headers)
            self.nonhtml_urls.append(url)
            return
        if not content_type.startswith("text/html"):
            self.nonhtml_urls.append(url)
            return

        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        for suburl in self.get_linked_urls(url, response.text):
            # Make it absolute, then fold equivalent domains onto base_url.
            suburl = self.map_equiv(urljoin(url, suburl))
            self.add_url_to_visit(suburl)

    def check_external_link(self, url):
        """Check headers for an external link.

        Return True if a HEAD request (following redirects) ends in
        HTTP 200, else False.  Request failures count as a bad link.
        """
        logging.info(f'Checking external link: {url}')
        try:
            head = requests.head(url, timeout=self.REQUEST_TIMEOUT,
                                 allow_redirects=True)
        except Exception:
            # Best effort: any request failure means "bad link".  Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            return False
        return head.status_code == 200

    def check_all_externals(self):
        """Populate ``bad_externals`` with every external URL that fails."""
        self.bad_externals = [
            url for url in self.external_urls
            if not self.check_external_link(url)
        ]

    def run(self):
        """Crawl until the queue of internal URLs is empty."""
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            # Mark as visited *before* crawling so that a page linking to
            # itself is not queued and crawled a second time.
            self.visited_urls.append(url)
            try:
                self.crawl(url)
            except Exception:
                self.bad_urls.append(url)
                logging.exception('Failed to crawl: %s', url)
if __name__ == '__main__':
    # Crawl the local site, verify external links, then write the reports
    # to /tmp.  Ctrl-C stops the crawl early but still writes whatever has
    # been collected so far.
    crawler = Crawler(urls=['http://localhost/'])

    # crawler.bad_externals is only meaningful once check_all_externals()
    # has run to completion, so remember whether it did; reading it after
    # an interrupt would fail or misreport unchecked links as good.
    externals_checked = False
    try:
        crawler.run()

        # Check external links
        crawler.check_all_externals()
        externals_checked = True
    except KeyboardInterrupt:
        print("Interrupt")

    with open("/tmp/urls-bad.txt", "w") as fp:
        for url in crawler.bad_urls:
            print(url, file=fp)

    with open("/tmp/urls-internal.txt", "w") as fp:
        for url in crawler.visited_urls:
            print(url, file=fp)

        print("\nNON-HTML FILES:\n", file=fp)
        for url in crawler.nonhtml_urls:
            print(url, file=fp)

    if externals_checked:
        # Set lookup keeps the split linear in the number of external URLs.
        bad_externals = set(crawler.bad_externals)
        with open("/tmp/urls-external-good.txt", "w") as goodfp, \
                open("/tmp/urls-external-bad.txt", "w") as badfp:
            for url in crawler.external_urls:
                print(url, file=badfp if url in bad_externals else goodfp)
    else:
        print("External links were not checked; skipping external reports")