-
Notifications
You must be signed in to change notification settings - Fork 8
/
weibo_s.py
81 lines (67 loc) · 2.91 KB
/
weibo_s.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
import requests
import json
import re
from lxml import etree
import logging
# Author of this module.
__author__ = 'lpe234'
class WeiBoSearchSpider(object):
    """Fetch one Weibo search result page and print the posts found in it.

    The search page loads its results through JavaScript: the result list
    is shipped as a JSON payload passed to a function call inside an inline
    ``<script>``.  This class downloads the page, extracts that payload and
    prints one line per result card.
    """

    # Base URL of the search page; the search text is appended to it.
    weibo_s = 'http://s.weibo.com/weibo/'

    # Browser-like headers sent with every request.
    request_headers = {
        'Host': 's.weibo.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://s.weibo.com/',
        'Connection': 'keep-alive',
    }

    # Seconds to wait for the server before giving up on a request.
    request_timeout = 10

    # Text identifying the inline script that carries the search results.
    _result_marker = '"pid":"pl_weibo_direct"'

    # Captures the argument of the script's function call (the JSON payload).
    _payload_re = re.compile(r'\((.*)\)')

    def __init__(self, search_text):
        """Store the search text and build the URL of its result page.

        Args:
            search_text: text to search for; appended as-is to ``weibo_s``.
        """
        self.search_text = search_text
        self.search_href = self.weibo_s + self.search_text
        # Set up root logging so request errors are reported.
        logging.basicConfig(level=logging.DEBUG)

    def do_request(self, search_href):
        """Download the page at ``search_href``.

        Args:
            search_href: URL to fetch.

        Returns:
            The raw response body, or ``None`` when the request fails or
            the server answers with a status other than 200.  Failures are
            logged.
        """
        try:
            # Without a timeout a stalled connection would block forever.
            resp = requests.get(search_href, headers=self.request_headers,
                                timeout=self.request_timeout)
        except requests.RequestException as exc:
            logging.error('request failed: %s', exc)
            return None
        if resp.status_code != 200:
            logging.error('request error: {}'.format(resp.status_code))
            return None
        return resp.content

    def analyse_content(self, content):
        """Parse a search page and print one line per post found in it.

        Each printed line holds, separated by spaces: the post id (``mid``),
        the author's name, the author's profile link, the author's avatar
        URL, the post text, and a comma-separated list of every image URL
        inside the card.

        Args:
            content: page body as returned by ``do_request``.  ``None`` or
                empty content is ignored, as is a page that does not
                contain the expected result script.
        """
        if not content:
            return
        weibo_html = self._extract_weibo_html(content)
        if not weibo_html:
            return
        result_tree = etree.HTML(weibo_html)
        if result_tree is None:
            return
        for card in result_tree.xpath('//div[contains(@class, "WB_cardwrap")]'):
            self._print_card(card)

    def _extract_weibo_html(self, content):
        """Return the result-list HTML embedded in the page's script, or None."""
        page = etree.HTML(content)
        if page is None:
            return None
        script_text = next(
            (text for text in page.xpath('//script/text()')
             if self._result_marker in text),
            None
        )
        if script_text is None:
            return None
        # The script is a function call whose argument is the JSON payload;
        # it depends on the page's current markup, so every step is checked.
        # The text is searched as-is: forcing it through str() would fail on
        # Python 2 for non-ASCII unicode text.
        match = self._payload_re.search(script_text)
        if match is None:
            return None
        try:
            payload = json.loads(match.group(1))
        except ValueError:
            logging.error('search result payload is not valid JSON')
            return None
        if not isinstance(payload, dict):
            return None
        return payload.get('html')

    @staticmethod
    def _print_card(card):
        """Print the fields of one result card on a single line."""
        mid = card.xpath('./div[@mid]/@mid')
        author_title = card.xpath('.//div[@class="face"]/a/@title')
        author_href = card.xpath('.//div[@class="face"]/a/@href')
        author_avatar = card.xpath('.//img[@class="W_face_radius"]/@src')
        text_parts = card.xpath('.//p[@class="comment_txt"]//text()')
        medias = card.xpath('.//img/@src')
        fields = [
            ''.join(mid),
            ''.join(author_title),
            ''.join(author_href),
            ''.join(author_avatar),
            ''.join(text_parts),
            ', '.join(medias),
        ]
        # A single-argument print() produces the same space-separated line
        # under both Python 2 and Python 3.
        print(' '.join(fields))

    def save_data(self):
        """Persist scraped data (not implemented yet)."""
        pass

    def run(self):
        """Fetch the search page and print the posts it contains."""
        content = self.do_request(self.search_href)
        if content is None:
            # do_request has already logged the failure; nothing to parse.
            return
        self.analyse_content(content)
if __name__ == '__main__':
    # Script entry point: run one search for the sample keyword
    # (Chinese for "football") and print the results.
    WeiBoSearchSpider('足球').run()