crawler.py
__author__ = 'Dimitri Zhang'

import urllib.request as urlreq
import os
import threading
from queue import Queue
from multiprocessing.dummy import Pool

from bs4 import BeautifulSoup
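
# Overview (added summary of what the script below does): it mirrors a
# course-material directory tree over HTTP. Two strategies are provided:
# crawling() drives a fixed pool of Downloader worker threads through a
# shared Queue, while parallelCrawling() maps the nextURL() generator over
# a multiprocessing.dummy thread pool.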

class Downloader(threading.Thread):
    """Worker thread: downloads (file path, url) pairs taken from a shared queue."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            msg = self.queue.get()
            if isinstance(msg, str) and msg == 'quit':
                break  # sentinel: shut this worker down
            fdir, url = msg
            print('downloading %s' % url)
            with open(fdir, 'wb') as f:
                f.write(urlreq.urlopen(url).read())
            print('%s downloaded' % url)
            self.queue.task_done()

def crawling(url):
    """Depth-first crawl of the directory listing, feeding files to worker threads."""
    homDir = '.'
    if not os.path.exists(homDir):
        os.mkdir(homDir)
    stack = [url]
    downloaders = []
    queue = Queue()
    for i in range(8):
        dl = Downloader(queue)
        dl.start()
        downloaders.append(dl)
    while len(stack) != 0:  # DFS over the directory tree
        topUrl = stack.pop()
        urltoken = topUrl.split('/')
        # mirror the remote path below 'Android%20Programming/' in the file system
        fdir = homDir + '/' + topUrl[topUrl.find('Android%20Programming/'):]
        if urltoken[-1] == '':  # trailing slash: a directory listing
            if not os.path.exists(fdir):
                os.mkdir(fdir)
            response = urlreq.urlopen(topUrl)
            soup = BeautifulSoup(response.read(), 'lxml')
            for link in soup.find_all('a'):
                href = link.get('href')
                # skip anchors without an href, parent-directory links, and sort links
                if href and not href.startswith('/~vkepuska') and not href.startswith('?'):
                    stack.append(topUrl + href)
        else:  # no trailing slash: a file to download
            queue.put((fdir, topUrl))
    for i in range(8):  # one 'quit' sentinel per worker
        queue.put('quit')
    for dl in downloaders:
        dl.join()

def nextURL(url):
    """Generator variant of crawling(): yields (file path, url) pairs to download."""
    homDir = '.'
    if not os.path.exists(homDir):
        os.mkdir(homDir)
    stack = [url]
    while len(stack) != 0:  # DFS over the directory tree
        topUrl = stack.pop()
        urltoken = topUrl.split('/')
        # mirror the remote path below 'Android%20Programming/' in the file system
        fdir = homDir + '/' + topUrl[topUrl.find('Android%20Programming/'):]
        if urltoken[-1] == '':  # trailing slash: a directory listing
            if not os.path.exists(fdir):
                os.mkdir(fdir)
            response = urlreq.urlopen(topUrl)
            soup = BeautifulSoup(response.read(), 'lxml')
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and not href.startswith('/~vkepuska') and not href.startswith('?'):
                    stack.append(topUrl + href)
        else:  # a file: hand it to the consumer
            print(fdir)
            yield (fdir, topUrl)

def download(val):
    """Fetch a single (file path, url) pair; used as the thread-pool task."""
    fdir, url = val
    print('downloading %s' % url)
    with open(fdir, 'wb') as f:
        f.write(urlreq.urlopen(url).read())
    print('%s downloaded' % url)

def parallelCrawling(url):
    """Crawl and download concurrently with a 12-thread pool."""
    threadPool = Pool(12)
    threadPool.map(download, nextURL(url))
    threadPool.close()
    threadPool.join()
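
# Note (added): the two entry points trade off control for simplicity.
# crawling() manages its own 8 Downloader threads and shuts them down with
# explicit 'quit' sentinels; parallelCrawling() lets multiprocessing.dummy
# schedule download() calls as nextURL() yields targets. Swap the call in
# main() below to try the pooled variant.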

def main():
    crawling('http://my.fit.edu/~vkepuska/Android%20Programming/')
    # parallelCrawling('http://my.fit.edu/~vkepuska/Android%20Programming/')

if __name__ == '__main__':
    main()