# -*- coding:UTF-8 -*-
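"""Gallery downloader for www.336699.com.

Finds the category list on the front page, walks every paginated listing
page, collects each gallery's detail pages, and saves the images into
nested folders (category/gallery/image). All parsing is done with plain
str.find() offsets against the site's markup, so it will break if the
HTML changes.
"""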
import urllib.request
import os
import re
page_url = 'http://www.336699.com/'
# Helper: open a URL with a browser-like User-Agent and return the raw bytes
def url_open(url):
    headers = {'User-Agent': 'Mozilla/5.0 3578.98 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req, timeout=200.0)
    html = response.read()
    return html
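# Example use: fetch a page and decode it (the site serves UTF-8), e.g.
#   html = url_open(page_url).decode('utf-8')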
# Find the categories on the front page and return a list of category URLs
def find_cate(page_url):
    print('Fetching categories...')
    html = url_open(page_url).decode('UTF-8')
    #print(html)
    Cates = []
    #a = html.find('?</span><a href="')
    a = html.find('<ul class="nav-list fl">')
    #print(a)
    a1 = html.find('</ul>')
    b = html[a:a1]
    #print(b)
    b1 = b.find('<a href="')
    while b1 != -1:
        b2 = b.find('/">', b1)
        if b2 != -1:
            print('Found category URL --> %s' % b[b1+9:b2+1])
            Cates.append(b[b1+9:b2+1])
        else:
            b2 = b1 + 50
        b1 = b.find('<a href="', b2)
    #print(Cates)
    # The "show" category is video-only, so it was removed here at one point
    #Cates.remove('http://www.zhengmei.co/show/')
    print('Found %d categories' % len(Cates))
    # Return the list of category URLs
    return Cates
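# For comparison, a regex-based version of the category scan; a minimal
# sketch assuming the same '<ul class="nav-list fl">' markup. It uses the
# `re` module imported above (otherwise unused) and is not called anywhere.
def find_cate_re(page_url):
    html = url_open(page_url).decode('UTF-8')
    nav = html[html.find('<ul class="nav-list fl">'):html.find('</ul>')]
    # Capture each href ending in '/', trailing slash included, to match
    # what find_cate() collects
    return re.findall(r'<a href="([^"]*/)">', nav)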
# Collect every detail-page URL in a category, walking all listing pages
def find_details(cate):
    print('Fetching detail-page URLs...')
    html = url_open(cate).decode('utf-8')
    # The pagination block links the first page ('首页') and, via the
    # '下一页' (next page) link, reveals the total page count
    a = html.find('<div class="page-show"><a href="')
    a1 = html.find('">首页', a)
    page1 = html[a+32:a1]
    b = html.find('下一页</a><a href="')
    b1 = html.find('.html', b)
    c = html[b:b1]
    page_all = c.split('_')[1]
    # Build the URL of every listing page in this category
    Cate_pages = []
    for num in range(1, int(page_all)+1):
        if num < 2:
            Cate_pages.append(page1)
        else:
            Cate_pages.append(page1[:-5] + '_' + str(num) + '.html')
    #print(Cate_pages)
    # Collect the detail pages from each listing page
    detail_pages = []
    for cate_page in Cate_pages:
        html = url_open(cate_page).decode('utf-8')
        a = html.find('张</span><a href="')
        while a != -1:
            a1 = html.find('" targe', a)
            if a1 != -1:
                print('Found detail-page URL --> %s' % html[a+17:a1])
                detail_pages.append(html[a+17:a1])
            else:
                a1 = a + 100
            a = html.find('张</span><a href="', a1)
    #print(detail_pages)
    print('Found %d detail-page URLs' % len(detail_pages))
    return detail_pages
'''
# Unused: convert the PC detail-page URLs to their mobile (H5) equivalents
print('Converting URLs...')
detail_h5 = []
for i in detail_pages:
    page_id = i.split('/')[-1].split('.')[0]
    page_h5 = 'http://m.336699.com/n.php?id=' + page_id
    detail_h5.append(page_h5)
print('URL conversion done!')
return detail_h5
'''
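# The listing-page URL pattern the loop above relies on, shown with a
# hypothetical first-page URL (assumed shape, not taken from the site):
#   page1 = 'http://www.336699.com/cate/list.html'
#   page1[:-5] + '_' + str(3) + '.html'   # -> 'http://www.336699.com/cate/list_3.html'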
# Collect the image URLs from one detail page (the gallery is paginated)
def find_img(page_pc):
    html = url_open(page_pc).decode('utf-8')
    # List of image URLs
    images = []
    # Collect the gallery's sub-pages from the '下一页'/'尾页' pagination links
    print('Collecting sub-pages')
    pages = []
    a = html.find('下一页')
    a1 = html.find('尾页', a)
    page_num = html[a:a1].split('_')[1].split('.')[0]
    for num in range(1, int(page_num)+1):
        if num < 2:
            pages.append(page_pc)
        else:
            pages.append(page_pc[:-5] + '_' + str(num) + '.html')
    print('Collected %d sub-pages' % len(pages))
    # Grab the image on each sub-page; each page holds a single image,
    # so no inner loop is needed
    print('Fetching image URLs...')
    for page in pages:
        html = url_open(page).decode('utf-8')
        b = html.find('<div class="big-pic">')
        b1 = html.find('</div>', b)
        c = html[b:b1]
        c1 = c.find('src="')
        c2 = c.find('" border=', c1)
        d = c[c1+5:c2]
        images.append(d)
    print('Collected %d image URLs' % len(images))
    return images
'''
# Unused: image extraction for the mobile (H5) pages produced by the
# PC-to-H5 URL conversion above
def find_img(page_h5):
    print('Fetching image URLs...')
    html = url_open(page_h5).decode('utf-8')
    # List of image URLs
    images = []
    a = html.find('<!-- .p-content 为内容区域 -->')
    a1 = html.find('<div class="prompt">', a)
    b = html[a:a1]
    b1 = b.find('src="')
    while b1 != -1:
        b2 = b.find('" border=', b1)
        if b2 != -1:
            images.append(b[b1+5:b2])
            print('Found image URL --> %s' % b[b1+5:b2])
        else:
            b2 = b1 + 144
        b1 = b.find('src="', b2)
    #print(images)
    return images
'''
# Download every image in img_src into the current working directory
# (`folder` is unused but kept for the existing call signature)
def save_img(folder, img_src):
    print('Saving images...')
    for img in img_src:
        img_name = img.split('/')[-1]
        print('Saving image --> %s' % img_name)
        # Fetch first, so a timeout doesn't leave an empty file behind
        try:
            img_content = url_open(img)
        except Exception:
            continue
        with open(img_name, 'wb') as f:
            f.write(img_content)
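# Note: save_img() writes into the current working directory; Downloader()
# below chdir()s into the right gallery folder before calling it.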
# Derive a folder name from the page <title> (the part before the first '_')
def folder_name(url):
    html = url_open(url).decode('utf-8')
    a = html.find('<title>')
    a1 = html.find('_', a)
    name = html[a+7:a1]
    # Strip characters that are illegal in folder names
    folder_dis = ['\\', '/', '|', ':', '?', '"', '“', '”', '*', '<', '>']
    for dis in folder_dis:
        name = name.replace(dis, '')  # replace() removes every occurrence
    print('Creating image folder %s' % name)
    return name
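# Example with a hypothetical title: '<title>示例图集_正妹公社</title>'
# yields the folder name '示例图集'.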
# Main driver: crawl every category and download every gallery
def Downloader(folder='正妹公社'):
    print('Download starting...')
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    # Get the category URLs, returned as a list
    Cates = find_cate(page_url)
    # One folder per category
    for cate in Cates:
        # Get the detail pages of the current category, returned as a list
        Details = find_details(cate)
        # Get the category name and create its folder
        Channel_folder = folder_name(cate)
        os.makedirs(Channel_folder, exist_ok=True)
        os.chdir(Channel_folder)
        # One folder per detail page (gallery)
        for detail in Details:
            Detail_folder = folder_name(detail)
            os.makedirs(Detail_folder, exist_ok=True)
            os.chdir(Detail_folder)
            # Get the image URLs
            Images = find_img(detail)
            # Save the images
            save_img(folder, Images)
            # Back up to the category folder
            os.chdir(os.pardir)
        # Back up to the top-level folder
        os.chdir(os.pardir)
    print('Download finished, enjoy!')

if __name__ == '__main__':
    Downloader()
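# Usage: run the script directly; everything is downloaded into ./正妹公社
# (one sub-folder per category, one per gallery):
#   python zhengmei_pc.py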