#!/usr/bin/env python
# -*- coding: utf-8 -*-
################################################################################
#
# Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
This module implements a multithreaded crawler that walks seed urls breadth-first and saves the image resources whose addresses match a target pattern.
Authors: hk
Date: 2015/07/16 17:23:06
"""
import os
import re
import sys
import gzip
import time
import Queue
import getopt
import shutil
import urllib
import urllib2
import logging
import StringIO
import threading
import ConfigParser
import chardet
import BeautifulSoup
import clist
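# The local clist module is not included in this file; from the calls used below
# (clist.List(), list_add(), list_query()) it is assumed to be a small thread-safe
# list wrapper. A minimal sketch of that assumed interface, for reference only:
#
#     class List(object):
#         def __init__(self):
#             self.lock = threading.Lock()
#             self.items = []
#
#         def list_add(self, url):       # record a url as crawled
#             with self.lock:
#                 self.items.append(url)
#
#         def list_query(self, url):     # True if the url was crawled before
#             with self.lock:
#                 return url in self.items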
class Crawler(threading.Thread):
    """Crawl the urls breadth-first.

    Attributes:
        queue: The queue holding the urls that are waiting to be crawled.
        ls: The list recording the urls that have already been crawled.
        output: The output directory path.
        timeout: The per-request timeout in seconds.
        target: The url pattern that downloaded resources must match.
    """
    def __init__(self, ls, queue, output, crawl_timeout, target_url):
        """Inits Crawler."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.ls = ls
        self.output = output
        self.timeout = crawl_timeout
        self.target = target_url
    def run(self):
        """Consume urls from the queue until a "none" sentinel is seen."""
        while True:
            if self.queue.empty():  # wait for more urls instead of busy-looping
                print "the queue is empty"
                time.sleep(20)
                continue
            chost = self.queue.get()  # get() pops a url from the head of the queue
            if chost == "":
                time.sleep(20)
                continue
            if chost == "none":  # sentinel pushed by main() to stop this thread
                return 0
            self.list_data(chost)  # crawl the page and enqueue the urls found on it
    def list_data(self, url):
        """Skip urls that were already crawled and enqueue the new ones."""
        if self.ls.list_query(url):  # check whether this url has been crawled before
            print "url is already crawled:", url
            return 0
        self.ls.list_add(url)
        print "start crawl:", url
        seen = []
        for i in self.geturl(url):  # drop duplicate urls found on the same page
            if i not in seen:
                seen.append(i)
                self.queue.put(i)
    def geturl(self, myurl):
        """Fetch one page and save the image resources that match the target."""
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib2.Request(myurl, headers=headers)
        try:
            mypage = urllib2.urlopen(req, timeout=self.timeout)
        except urllib2.HTTPError as e:
            logging.exception(e)
            return []
        except urllib2.URLError as e:
            logging.exception(e)
            return []
        if mypage.info().get('Content-Encoding') == 'gzip':  # decompress gzipped pages
            buf = StringIO.StringIO(mypage.read())
            f = gzip.GzipFile(fileobj=buf)
            pg = f.read()
        else:
            pg = mypage.read()
        chardit1 = chardet.detect(pg)  # guess the page encoding
        print "page encoding:", chardit1['encoding']
        soup = BeautifulSoup.BeautifulSoup(pg, fromEncoding=chardit1['encoding'])
        urls = []
        for item in soup.findAll('img'):
            h = re.search(r'src="([^"]+)"', str(item))
            if not h:
                continue
            href = h.group(1)
            # keep only absolute http(s) links that match the target pattern
            if re.match(r'https?://', href) and re.search(self.target, href):
                urls.append(href)
                try:
                    ans = urllib2.urlopen(href, timeout=self.timeout).read()
                except urllib2.URLError as e:
                    logging.exception(e)
                    continue
                # build a flat filename from the url
                href = href.replace(':', '_').replace('/', '_').replace('?', '_')
                f = open(os.path.join(self.output, href), 'wb')
                f.write(ans)
                f.close()
        return urls
def start_thread(ls, thread_count, openurl, outputfile, crawl_timeout, target_url):
    """Start thread_count crawler threads and wait for all of them to finish."""
    threads = []
    for i in range(thread_count):
        threads.append(Crawler(ls, openurl, outputfile, crawl_timeout, target_url))
    for t in threads:
        t.start()  # start the threads
    for t in threads:
        t.join()  # wait until every thread has consumed its "none" sentinel
def main(argv):
    """Parse the command line options and drive the crawl."""
    opts, args = getopt.getopt(argv[1:], "hvc:")
    for op, value in opts:
        if op == "-h":
            print "usage: python crawl.py -c <config_file>"
            sys.exit()
        elif op == "-v":
            print "the version is 1.0..."
            sys.exit()
        elif op == "-c":
            cf = ConfigParser.ConfigParser()
            cf.read(value)
            inputfile = cf.get("spider", "url_list_file")           # seed url list file
            outputfile = cf.get("spider", "output_directory")       # output directory
            max_depth = cf.getint("spider", "max_depth")            # maximum crawl depth
            crawl_interval = cf.getint("spider", "crawl_interval")  # crawl interval (seconds)
            crawl_timeout = cf.getint("spider", "crawl_timeout")    # request timeout (seconds)
            target_url = cf.get("spider", "target_url")             # pattern the saved urls must match
            thread_count = cf.getint("spider", "thread_count")      # number of crawler threads
            openurl = Queue.Queue(0)
            ls = clist.List()  # list of urls that have already been crawled
            urls = open(inputfile, 'r')
            for url in urls:
                openurl.put(url.strip('\n'))
            urls.close()
            if openurl.empty():  # nothing to crawl
                print "no seed links"
                sys.exit()
            depth = 0
            while depth <= max_depth:  # one pass of threads per depth level
                print "crawling depth:", depth
                for i in range(thread_count):
                    openurl.put("none")  # one stop sentinel per thread for this level
                start_thread(ls, thread_count, openurl, outputfile, crawl_timeout, target_url)
                depth += 1
            print "reach the max depth"
        else:
            print "unhandled option"
            sys.exit()
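# main() expects an INI-style file passed with -c; a minimal example of the
# [spider] section it reads (the key names come from main() above, the values
# below are only illustrative):
#
#     [spider]
#     url_list_file: ./urls
#     output_directory: ./output
#     max_depth: 1
#     crawl_interval: 1
#     crawl_timeout: 5
#     target_url: .*\.(gif|png|jpg|bmp)$
#     thread_count: 8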
if __name__ == '__main__':
main(sys.argv)
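# Example invocation (the config file name is only illustrative):
#
#     python crawl.py -c spider.conf
#
# where url_list_file points at a plain text file with one seed url per line,
# e.g.:
#
#     http://www.example.com/
#     http://www.example.org/index.html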