# crawler.py
from bottle import Bottle, view, request, redirect
from wtforms import Form, StringField, IntegerField, BooleanField, validators
import urllib.request
import urllib.parse
import urllib.robotparser
from bs4 import BeautifulSoup
from queue import Queue
import threading
import time
import gzip

from model import Base, Page, Relation, session, engine

crawler_app = Bottle()


class CrawlerFormProcessor(Form):
    url = StringField('URL', [validators.URL(require_tld=False, message="Must be valid URL")], default="http://",
                      render_kw={"placeholder": "https://example.com"})
    depth = IntegerField('Max depth', [validators.NumberRange(min=1, message="Must be > 0")], default=3)
    threads = IntegerField('Threads', [validators.NumberRange(min=1, message="Must be > 0")], default=16)
    max_pages = IntegerField('Maximum pages', [validators.NumberRange(min=0, message="Must be 0 or positive")], default=500)
    uel = BooleanField('Include external links')


db_lock = threading.Lock()


def is_ascii(s):
    return all(ord(c) < 128 for c in s)


def fix_non_ascii(wb):
    # fix website link: percent-encode its non-ASCII parts
    url = list(urllib.parse.urlsplit(wb))
    for i in range(1, 5):
        url[i] = urllib.parse.quote(url[i])
    return urllib.parse.urlunsplit(url)
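

# Quick illustration of fix_non_ascii (example values only):
#   fix_non_ascii("http://example.com/café") -> "http://example.com/caf%C3%A9"
# Everything except the scheme is percent-encoded; internationalized domain
# names are not converted to punycode here.
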
def add_page_with_text_to_database(page, text):
    # store the extracted text for an already-known page
    with db_lock:
        try:
            q = session.query(Page).filter(Page.url == page).scalar()
            if q is not None:
                q.text = text
            session.commit()
        except Exception:
            # undo the failed transaction before propagating the error
            session.rollback()
            raise


def add_page_pair_to_database(from_page, to_page, limit):
    # record a link between two pages, creating the Page rows if needed
    with db_lock:
        cou = session.query(Page.id).filter(Page.url == from_page).scalar()
        cou1 = session.query(Page.id).filter(Page.url == to_page).scalar()
        if cou is None:
            new_page_from = Page(url=from_page, text="", rank=0)
            session.add(new_page_from)
            session.flush()
            id0 = new_page_from.id
        else:
            id0 = cou
        if cou1 is None:
            # respect the pages limit (a limit below 1 means "no limit")
            allowed = limit < 1 or limit > session.query(Page).count()
            if not allowed:
                return
            new_page_to = Page(url=to_page, text="", rank=0)
            session.add(new_page_to)
            session.flush()
            id1 = new_page_to.id
        else:
            id1 = cou1
        new_relation = Relation(page_id=id0, destination_id=id1)
        session.add(new_relation)
        session.commit()


class Crawler:
    def __init__(self, website, depth=3, pages_limit=0, threads_number=16, remove_external_links=True):
        # settings
        self.website = self.make_requestable_link(website)
        if not is_ascii(self.website):
            self.website = fix_non_ascii(self.website)
        self.depth = depth
        self.pages_limit = pages_limit
        self.threads_number = threads_number
        self.remove_external_links = remove_external_links
        self.base = self.make_base(self.website)
        print("Crawler initialized!")
        print("Website =", self.website)
        print("Depth =", self.depth)
        print("Pages_limit =", self.pages_limit)
        print("Threads_number =", self.threads_number)
        print("Base =", self.base)
        print("External removed =", self.remove_external_links)
        # threading
        self.q = Queue()
        self.processed_lock = threading.Lock()
        self.pages_counter_lock = threading.Lock()
        # processing
        self.processed = set()
        self.robot_parser = urllib.robotparser.RobotFileParser()
        self.current_pages_processed = 1
        # output
        self.dictionary = {}

    @classmethod
    def make_requestable_link(cls, website):
        # add an 'http://' scheme to the link if it is missing
        if website.find("http://") != 0 and website.find("https://") != 0:
            website = "http://" + website
        return website

    @classmethod
    def make_base(cls, website):
        # domain base, e.g. "example.com"
        if website.find("https") == 0:
            temp_base = website[8:]
        else:
            temp_base = website[7:]
        slash_pos = temp_base.find('/')
        if slash_pos != -1:
            temp_base = temp_base[:slash_pos]
        temp_base = ".".join(temp_base.split(".")[-2:])
        return temp_base
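
    # Quick illustration of the two helpers above (example values only):
    #   Crawler.make_requestable_link("sub.example.com/docs") -> "http://sub.example.com/docs"
    #   Crawler.make_base("http://sub.example.com/docs")      -> "example.com"
    # Note that make_base keeps only the last two labels, so a domain such as
    # "example.co.uk" would be reduced to "co.uk".
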
    def get_outlinks(self, wb):
        # init resulting set
        results = set()
        # fix link if needed
        if not is_ascii(wb):
            wb = fix_non_ascii(wb)
        req = urllib.request.Request(
            wb,
            headers={
                "Accept-Encoding": "gzip"
            })
        # fetch the headers first to find out how the body is encoded
        gzip_ = False
        try:
            with urllib.request.urlopen(req, timeout=15) as url:
                info = url.info()
                if info["Content-Encoding"] == "gzip":
                    gzip_ = True
        except IOError as e:
            print("Couldn't get info for url", wb, e)
            return set()
        # discard non-HTML responses
        if info is None:
            return set()
        content_type = info['Content-Type']
        if content_type is None or content_type.find("html") == -1:
            print("Error: not an HTML page!", wb)
            return set()
        # fetch the content
        try:
            with urllib.request.urlopen(req, timeout=15) as url:
                if not gzip_:
                    page = url.read()
                else:
                    page = gzip.decompress(url.read())
        except IOError:
            print("Couldn't open url", wb)
            return set()
        # prepare soup and strip scripts/styles before extracting text
        # http://stackoverflow.com/a/24618186
        soup = BeautifulSoup(page, "html.parser")
        for script in soup(["script", "style"]):
            script.extract()  # rip it out
        text = soup.get_text()
        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
        add_page_with_text_to_database(wb, text)
        # collect outgoing links
        for link in soup.find_all('a'):
            temp = link.get('href')
            # skip empty links
            if temp is None:
                continue
            if len(temp) == 0:
                continue
            if temp.isspace():
                continue
            if temp == "?":
                continue
            # resolve relative links
            temp = urllib.parse.urljoin(wb, temp)
            # throw away pure anchors
            if temp[0] == '#':
                continue
            # cut trailing anchors from URLs
            if temp.rfind('#') != -1:
                temp = temp[:temp.rfind('#')]
            # throw away javascript:, mailto: and similar schemes
            if temp[:4] != "http":
                continue
            if self.remove_external_links:
                base_pos = temp.find(self.base)
                # position of the first '/' after the scheme (-1 if the URL has no path)
                sl = temp[8:].find("/")
                if sl != -1:
                    sl += 8
                # the base must occur in the host part, not in the path
                if base_pos == -1 or (sl != -1 and sl < base_pos):
                    continue
                # and it must sit at a domain boundary (preceded by '.' or the scheme's '/')
                if temp[base_pos - 1] not in "./":
                    continue
            if not is_ascii(temp):
                temp = fix_non_ascii(temp)
            results.add(temp)
        return results
    def worker(self):
        debug = True
        while True:
            # get a task from the queue
            current = self.q.get()
            # None is the sentinel telling the worker to stop
            if current is None:
                break
            current_depth = current[0]
            current_url = current[1]
            new_depth = current_depth + 1
            # mark the URL as taken
            with self.processed_lock:
                if debug:
                    print(threading.current_thread().name, "requests", current_depth, current_url)
                self.processed.add(current_url)
            # do the work
            res = self.get_outlinks(current_url)
            # add new links to the queue while we are still within the depth limit
            if new_depth <= self.depth:
                for i in res:
                    add_page_pair_to_database(current_url, i, self.pages_limit)
                with self.processed_lock:
                    for item in res:
                        if self.robot_parser.can_fetch("*", item):
                            if item not in self.processed:
                                # skip links that are already queued
                                should_insert = True
                                for i in list(self.q.queue):
                                    if item == i[1]:
                                        should_insert = False
                                        break
                                if should_insert and \
                                        (self.current_pages_processed < self.pages_limit or self.pages_limit == 0):
                                    self.q.put((new_depth, item))
                                    self.current_pages_processed += 1
                        else:
                            print(threading.current_thread().name, "Restricted by robots.txt", item)
            self.q.task_done()
        print(threading.current_thread().name, "is done. Bye-bye")
    def start_crawler(self):
        start = time.time()
        # read robots.txt
        tmp = "http://" + self.base + "/robots.txt"
        self.robot_parser.set_url(tmp)
        self.robot_parser.read()
        # put the first link and its Page row
        self.q.put((0, self.website))
        new_page = Page(url=self.website, text="", rank=0)
        session.add(new_page)
        session.commit()
        # start the worker threads
        threads = []
        for x in range(self.threads_number):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            threads.append(t)
            t.start()
        # wait until the queue becomes empty
        self.q.join()
        # stop the workers and join them
        for i in range(self.threads_number):
            self.q.put(None)
        for t in threads:
            t.join()
        session.commit()
        # empty the queue
        self.q.queue.clear()
        end = time.time()
        print("With", self.threads_number, "threads elapsed :", end - start)
        print("Total number of pages processed :", self.current_pages_processed)


@crawler_app.get('/crawler')
@crawler_app.post('/crawler')
@view('crawler')
def crawler():
    form = CrawlerFormProcessor(request.forms.decode())
    if request.method == 'POST' and form.validate():
        session.commit()
        # recreate the schema so every crawl starts from an empty database
        Base.metadata.drop_all(engine)
        Base.metadata.create_all(engine)
        crawl = Crawler(website=form.url.data,
                        depth=form.depth.data,
                        pages_limit=form.max_pages.data,
                        threads_number=form.threads.data,
                        remove_external_links=not form.uel.data)
        crawl.start_crawler()
        session.commit()
        print("Finish: " + form.url.data)
        redirect("/pages")
    return locals()
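

# Minimal command-line sketch for running the crawler without the web form.
# It mirrors what the /crawler handler above does and assumes that the
# SQLAlchemy `engine` and `session` imported from `model` point at a
# database you are willing to wipe.
if __name__ == "__main__":
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)
    crawl = Crawler(website="http://example.com",  # placeholder URL, replace with a real site
                    depth=2,
                    pages_limit=50,
                    threads_number=4,
                    remove_external_links=True)
    crawl.start_crawler()
    session.commit()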