-
Notifications
You must be signed in to change notification settings - Fork 2
/
topictreespider.py
167 lines (153 loc) · 5.14 KB
/
topictreespider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#-*- coding: UTF-8 -*-
import urllib2
import re
from pyquery import PyQuery as pq
import config
import time
from getsubtopic import Gst
from writedown import Wd
import random
import sys
#根话题
# url = '/topic/19776749/organize/entire'
#可修改名称话题&&无子话题
#有子话题
# url = '/topic/19776751/organize/entire'
# 记录器
wd = Wd(config.tdfn,config.dfn)
# 获取子节点工具
gst = Gst()
global baseUrl
baseUrl = "https://www.zhihu.com"
# 抓取队列
global grabList
grabList = []
# 已抓取队列 用于判重
global haveList
haveList = {}
#获取页面内容
def get_html_value(url):
try:
request = urllib2.Request(url,headers=config.pageheaders)
response = urllib2.urlopen(request)
return response.read()
except:
return
# 获取关注人数
def get_topic_focus(htmlVlue):
result = re.findall(".*<strong>(\d*)</strong>.*",htmlVlue)
if(result):
return result[0]
# 获取话题名称
def get_topic_title(htmlVlue):
result = re.findall(".*<h1 class=\"zm-editable-content\"(\ ?.*)?>(.*)</h1>.*",htmlVlue)
if(result):
return result[0][1]
# 获取当前话题ID
def get_topic_id(url):
result = re.findall(".*/topic/(\d*)/organize.*",url)
if(result):
return result[0]
# 获取父话题
def get_parents_str(htmlValue):
d=pq(htmlValue)
if(d):
return d('.parent-topic')
# 获取父话题名称
def get_parents_title(parents_str):
_list = []
d = pq(parents_str)
if(d):
for i in range(0,len(d('a'))):
_list.append(d('a').eq(i).html().encode('latin1').decode('utf8').strip())
return _list
# 获取父话题ID
def get_parents_id(parents_str):
_list = []
d = pq(parents_str)
if(d):
for i in range(0,len(d('a'))):
_list.append(d('a').eq(i).attr('data-token'))
return _list
# 检查子话题数据
def check_subtopic_data(myjson,fatherid):
# 检查数据是否正常
if(isinstance(myjson, dict)):
# 子话题是否获取完毕
while(len(myjson['msg'][1])==11):
wd.wd_temporaryData(myjson)
url = 'https://www.zhihu.com/topic/'+fatherid+'/organize/entire?child='+myjson['msg'][1][10][0][2]+'&parent='+fatherid
for i in range(0, len(myjson['msg'][1])-1):
grabList.append(myjson['msg'][1][i][0][2])
print '待 ' + str(len(grabList))
sys.stdout.flush()
time.sleep(config.sleeptime+random.uniform(0,3))
myjson = gst.get_subtopic_json(url,config.subtopic_values,config.subtopic_headers)
# 子话题获取完毕后
wd.wd_temporaryData(myjson)
for i in range(0, len(myjson['msg'][1])):
grabList.append(myjson['msg'][1][i][0][2])
print '待 ' + str(len(grabList))
sys.stdout.flush()
return 'end'
else:
print "异常"
print myjson
sys.stdout.flush()
time.sleep(60+random.uniform(0,3))
return check_subtopic_data(myjson,fatherid)
# 检查抓取队列
def check_topiclist():
if len(grabList)!=0:
return 1
else:
return 0
# 获取一个话题数据
def getonetopic(url,topicid):
initurl = str('https://www.zhihu.com/topic/'+str(topicid)+'/organize/entire?child='+str(topicid)+'&parent='+str(topicid))
# 获取页面
html = get_html_value(url)
if(html!=None):
# 获取父节点数据
_parents_str = get_parents_str(html)
# 记录话题树数据
_title = get_topic_title(html)
_id = get_topic_id(url)
_focus = get_topic_focus(html)
_parentsname = get_parents_title(_parents_str)
_parentsid = get_parents_id(_parents_str)
if(isinstance(_parentsname, list)):
for i in range(0,len(_parentsname)):
wd.wd_Data(_title,_id,_focus,_parentsname[i],_parentsid[i])
else:
wd.wd_Data(_title,_id,_focus,_parentsname,_parentsid)
# 写入已抓取队列用于判重
_url = get_topic_id(url)
haveList[_url] = ''
# 程序入口
def starttopictreespider():
initID = 19776749
initurl = baseUrl + '/topic/19776749/organize/entire#anchor-children-topic'
# 获取话题数据
getonetopic(initurl,initID)
# 获取子话题数据
subtopic_json = gst.get_subtopic_json(initurl,config.subtopic_values,config.subtopic_headers)
# 检查子话题数据
check_subtopic_data(subtopic_json,initID)
# 检查抓取队列
while(check_topiclist()):
print '已 ' + str(len(haveList))
print '待 ' + str(len(grabList))
initurl = baseUrl + '/topic/'+grabList[0]+'/organize/entire#anchor-children-topic'
initID = grabList[0]
del grabList[0]
if((initID in haveList)==False):
sys.stdout.flush()
time.sleep(config.sleeptime+random.uniform(0,3))
getonetopic(initurl,initID)
# 获取子节点数据
subtopic_json = gst.get_subtopic_json(initurl,config.subtopic_values,config.subtopic_headers)
# 检查子节点数据
check_subtopic_data(subtopic_json,initID)
print '抓取结束'
starttopictreespider()