-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcrawlerSync.py
55 lines (42 loc) · 1.5 KB
/
crawlerSync.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from crawlerBase import CrawlerBase
import time
import lib.functions as func
'''
Synchronous Crawler for www.lagou.com
The most inefficient approach
Avg.= 22.0 sec per 100 requests
'''
class CrawlerSync(CrawlerBase):
    '''
    Synchronous crawler: issues its requests one after another.

    Every overridden method simply delegates to the CrawlerBase
    implementation; the overrides exist only so that each step can be
    wrapped in func.timerAccumulate. fire() reports the resulting
    t0..t3 values in its summary log line.
    '''

    # Time-cost counters named by the timerAccumulate decorators below, plus
    # the running count of new items. Labels used in the summary log line:
    #   t0 -> "Time cost(Synchro)"  (singleRequest)
    #   t1 -> "Request"             (fetchPageContent)
    #   t2 -> "Select"              (hasDuplicate)
    #   t3 -> "Save"                (addRecord)
    t0, t1, t2, t3, total_new = 0.0, 0.0, 0.0, 0.0, 0

    def __init__(self):
        '''Initialise the crawler by running the CrawlerBase constructor.'''
        CrawlerBase.__init__(self)

    # The decorators below record the time consumed by each part.
    # NOTE(review): presumably func.timerAccumulate(name) adds the elapsed
    # time of each call to the attribute `name` -- confirm in lib.functions.

    @func.timerAccumulate('t1')
    def fetchPageContent(self, post=None):
        '''
        Timed pass-through to CrawlerBase.fetchPageContent.

        post -- data forwarded unchanged to the base implementation.
                When omitted, a fresh empty dict is used for this call, so
                one call can never leak mutations into the next (the former
                `post={}` default was a single dict shared by all calls).
        Returns whatever CrawlerBase.fetchPageContent returns.
        '''
        if post is None:
            post = {}
        return CrawlerBase.fetchPageContent(self, post)

    @func.timerAccumulate('t2')
    def hasDuplicate(self, data):
        '''Timed pass-through to CrawlerBase.hasDuplicate(data).'''
        return CrawlerBase.hasDuplicate(self, data)

    @func.timerAccumulate('t3')
    def addRecord(self, data):
        '''Timed pass-through to CrawlerBase.addRecord(data).'''
        return CrawlerBase.addRecord(self, data)

    @func.timerAccumulate('t0')
    def singleRequest(self, i):
        '''
        Timed pass-through to CrawlerBase.singleRequest(i).

        fire() adds the return value to total_new, so it is presumably the
        number of new items found for request `i` -- confirm in CrawlerBase.
        '''
        return CrawlerBase.singleRequest(self, i)

    def fire(self):
        '''
        Run requests 1 through 30 in sequence and log a summary.

        Any exception stops the loop; it is printed and written to the
        'crawler' log rather than propagated. The summary line (timings and
        new-item count) is always logged, even after a failure.
        '''
        try:
            for i in range(1, 31):
                self.total_new += self.singleRequest(i)
        except Exception as e:
            # Top-level boundary: report the failure, then fall through to
            # the summary in `finally`.
            print('Error: ' + str(e))
            func.logger('crawler', time.strftime(
                '%Y-%m-%d %H:%M:%S ') + '[error] ' + str(e))
        finally:
            msg = '%s Time cost(Synchro):%.4f New item:%d Request:%.4f Select:%.4f Save:%.4f' % (
                time.strftime('%Y-%m-%d %H:%M:%S'), self.t0, self.total_new,
                self.t1, self.t2, self.t3)
            func.logger('crawler', msg)
# Script trigger: build the synchronous crawler and start the crawl at once.
# NOTE(review): there is no `if __name__ == '__main__'` guard, so this also
# runs whenever the module is imported.
a = CrawlerSync()
a.fire()