-
Notifications
You must be signed in to change notification settings - Fork 51
/
SpiderScript.py
118 lines (115 loc) · 4.89 KB
/
SpiderScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- encoding: utf-8 -*-
from enum import Flag
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time
import numpy as np
import requests
import json
import csv
import io
# Save the collected comment rows to disk.
def commentSave(list_comment):
    '''
    Write the comment rows to data/JDComment_data.csv (UTF-8), preceded by a
    fixed header row, then print a confirmation message.

    list_comment: 2-D list; each inner list is one comment row and is written
                  as-is, in the same column order as the header.

    The output directory is created when missing, and the file is opened in a
    with-block so it is closed even if a write fails part-way through.
    '''
    # Local import so this function does not depend on an edit to the
    # file-level import block.
    import os

    out_path = 'data/JDComment_data.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with io.open(out_path, 'w', encoding="utf-8", newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['用户ID','评论内容','购买时间','点赞数','回复数','得分','评价时间','手机型号'])
        writer.writerows(list_comment)
    print('存入成功')
def getCommentData(format_url,proc,i,maxPage):
    '''
    Download the comment pages for one product/filter combination and append
    one row per comment to the module-level list_comment.

    format_url: URL template with three positional slots; filled here with
                (proc, i, page index)
    proc: product-id query fragment that identifies the product
    i: filter code placed in the template's second slot (e.g. all comments,
       with pictures, follow-up reviews, positive reviews)
    maxPage: number of comment pages to fetch

    Relies on two module-level names created by the script entry point:
    headers (request headers) and list_comment (the output list).
    Returns nothing.

    Pages are requested as 0 .. maxPage-1. The caller obtains maxPage from
    page 0, so page 0 is a real page; the previous loop started at 1 and
    never collected it. A page that still fails after max_retries attempts is
    reported and skipped instead of being retried forever.
    '''
    max_retries = 5
    # JSON keys copied into each row, in CSV column order.
    fields = (
        'id',               # user ID
        'content',          # comment text
        'referenceTime',    # purchase time
        'usefulVoteCount',  # number of "useful" votes
        'replyCount',       # number of replies
        'score',            # rating
        'creationTime',     # time the comment was posted
        'referenceName',    # phone model
    )
    for cur_page in range(int(maxPage)):
        url = format_url.format(proc, i, cur_page)  # fill in the template
        for _attempt in range(max_retries):
            try:
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept from the original, confirm it is really required.
                response = requests.get(url=url, headers=headers, verify=False, timeout=10)
                # Random pause of up to two seconds between requests.
                time.sleep(np.random.rand() * 2)
                text = response.text
                # The body is JSONP: keep only the outermost {...} object.
                payload = json.loads(text[text.find('{'):text.rfind('}') + 1])
                # Build the whole page first so a failure half-way through
                # cannot leave partial rows that a retry would duplicate.
                page_rows = [[comment[key] for key in fields]
                             for comment in payload['comments']]
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                print('网络故障或者是网页出现了问题,五秒后重新连接')
                time.sleep(5)
                continue
            print("当前第%s页"%cur_page)
            for row in page_rows:
                list_comment.append(row)
                print(row)
            break
        else:
            print('page %s skipped after %s failed attempts' % (cur_page, max_retries))
# Script entry point: crawl every product / filter combination, then save.
# headers and list_comment are deliberately module-level names because
# getCommentData reads them as globals.
if __name__ == "__main__":
    ua = UserAgent(cache=True)
    format_url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&{0}&score={1}&sortType=5&page={2}&pageSize=10&isShadowSku=0&fold=1'
    # Request headers sent with every call.
    headers = {
        'Accept': '*/*',
        'Host':"club.jd.com",
        "User-Agent":ua.random,
        'sec-ch-ua':"\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Google Chrome\";v=\"92\"",
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode':'no-cors',
        'Sec-Fetch-Site':'same-site',
    }
    # Product-id query fragments for the four phone colours.
    productid = ['productId=100006795590','136061&productId=5089275','22778&productId=5475612','7021&productId=6784504']
    # Start empty: the former [[]] seed was written out as a blank CSV row.
    list_comment = []
    max_retries = 5
    for proc in productid:  # one pass per product colour
        # Same values the original counter produced: 0-5 and 7 (6 is skipped).
        for i in (0, 1, 2, 3, 4, 5, 7):
            # Request page 0 first to learn how many pages exist.
            url = format_url.format(proc, i, 0)
            print(url)
            max_page = None
            for _attempt in range(max_retries):
                try:
                    # NOTE(review): verify=False disables TLS certificate
                    # checks; kept from the original, confirm it is required.
                    response = requests.get(url=url, headers=headers, verify=False, timeout=10)
                    jsonData = response.text
                    print(jsonData)
                    # The body is JSONP: keep only the outermost {...} object.
                    jsonData = json.loads(jsonData[jsonData.find('{'):jsonData.rfind('}') + 1])
                    max_page = jsonData['maxPage']
                    break
                except Exception as e:
                    print("the error is ",e)
                    print("wating---")
                    time.sleep(5)
            if max_page is None:
                # Give up on this combination instead of retrying forever.
                print('giving up on %s after %s failed attempts' % (url, max_retries))
                continue
            print("最大页数%s"%max_page)
            # Called outside the retry loop so an error while crawling pages
            # cannot restart (and duplicate) the whole combination.
            getCommentData(format_url, proc, i, max_page)
    print("爬取结束,开始存储-------")
    commentSave(list_comment)