-
Notifications
You must be signed in to change notification settings - Fork 0
/
scramp.py
148 lines (131 loc) · 4.64 KB
/
scramp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import requests
import re
import json
import threading
import pymongo
def flush_data(data):
if '肤' in data:
return '肤色'
if '黑' in data:
return '黑色'
if '紫' in data:
return '紫色'
if '粉' in data:
return '粉色'
if '蓝' in data:
return '蓝色'
if '白' in data:
return '白色'
if '灰' in data:
return '灰色'
if '槟' in data:
return '香槟色'
if '琥' in data:
return '琥珀色'
if '红' in data:
return '红色'
if '紫' in data:
return '紫色'
if 'A' in data:
return 'A'
if 'B' in data:
return 'B'
if 'C' in data:
return 'C'
if 'D' in data:
return 'D'
def find_product_id(key_word):
jd_url = 'https://search.jd.com/Search'
product_ids = []
# 爬前3页的商品
for key_word in range(1,3):
param = {'keyword': key_word, 'enc': 'utf-8', 'page': i}
response = requests.get(jd_url,params=param)
ids = re.findall('data-pid="(.*?)"', response.text, re.S)
product_ids += ids
return product_ids
"""获取评论内容"""
def get_comment_message(product_id):
urls = ['https://sclub.jd.com/comment/productPageComments.action?'
'callback=fetchJSON_comment98vv53282&' \
'productId={}' \
'&score=0&sortType=5&' \
'page={}' \
'&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(product_id, page) for page in range(1, 11)]
for url in urls:
response = requests.get(url)
html = response.text
html = html.replace('fetchJSON_comment98vv53282(', '').replace(');', '')
data = json.loads(html)
comments = data['comments']
t = threading.Thread(target=save_mongo, args=(comments,))
t.start()
def save_mongo(comments):
for comment in comments:
product_data = {}
product_data['product_color'] = flush_data(comment['productColor'])
product_data['product_size'] = flush_data(comment['productSize'])
product_data['comment_content'] = comment['content']
product_data['create_time'] = comment['creationTime']
# 插入mongo
# flush_data清洗数据的方法
product_db.insert(product_data)
lock = threading.Lock()
# 获取评论线程
def spider_jd(ids):
while ids:
# 加锁
lock.acquire()
# 取出第一个元素
id = ids[0]
# 将取出的元素从列表中删除,避免重复加载
del ids[0]
# 释放锁
lock.release()
# 获取评论内容
get_comment_message(id)
product_ids = find_product_id('胸罩')
for i in (1, 5):
# 增加一个获取评论的线程
t = threading.Thread(target=spider_jd, args=(product_ids,))
# 启动线程
t.start()
if __name__ =="__main__":
from pylab import *
client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
# jd数据库
db = client.jd
# product表,没有自动创建
product_db = db.product
# jd数据库
db = client.jd
# product表,没有自动创建
product_db = db.product
# 统计以下几个颜色
color_arr = ['肤色', '黑色', '紫色', '粉色', '蓝色', '白色', '灰色', '香槟色', '红色']
color_num_arr = []
for i in color_arr:
num = product_db.count({'product_color': i})
color_num_arr.append(num)
# 显示的颜色
color_arr = ['bisque', 'black', 'purple', 'pink', 'blue', 'white', 'gray', 'peru', 'red']
#labeldistance,文本的位置离远点有多远,1.1指1.1倍半径的位置
#autopct,圆里面的文本格式,%3.1f%%表示小数有三位,整数有一位的浮点数
#shadow,饼是否有阴影
#startangle,起始角度,0,表示从0开始逆时针转,为第一块。一般选择从90度开始比较好看
#pctdistance,百分比的text离圆心的距离
#patches, l_texts, p_texts,为了得到饼图的返回值,p_texts饼图内部文本的,l_texts饼图外label的文本
patches,l_text,p_text = plt.pie(sizes, labels=labels, colors=colors,
labeldistance=1.1, autopct='%3.1f%%', shadow=False,
startangle=90, pctdistance=0.6)
#改变文本的大小
#方法是把每一个text遍历。调用set_size方法设置它的属性
for t in l_text:
t.set_size=(30)
for t in p_text:
t.set_size=(20)
# 设置x,y轴刻度一致,这样饼图才能是圆的
plt.axis('equal')
plt.title("内衣颜色比例图", fontproperties="SimHei") #
plt.legend()
plt.show()