catch.py
import cv2 as cv
import json
import random
import bs4
import os
import requests

# Links for content extraction: the site root and the national-dex list page
# (Pokémon list by National Pokédex number, simple version).
root = 'https://wiki.52poke.com'
url = 'https://wiki.52poke.com/wiki/%E5%AE%9D%E5%8F%AF%E6%A2%A6%E5%88%97%E8%A1%A8%EF%BC%88%E6%8C%89%E5%85%A8%E5%9B%BD%E5%9B%BE%E9%89%B4%E7%BC%96%E5%8F%B7%EF%BC%89/%E7%AE%80%E5%8D%95%E7%89%88'

# Build the request headers with a randomly chosen User-Agent.
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
user_agent = random.choice(ua_list)
headers = {'User-Agent': user_agent}
def getPokedex(url):
    """Collect per-species page URLs from the national-dex index page."""
    pokedex = []
    with requests.get(url=url, headers=headers) as response:
        data = response.content.decode()
        soup = bs4.BeautifulSoup(data, 'lxml')
        title_list = soup.select('.mw-redirect')
        # Skip the first redirect link, then keep only entries whose link text
        # starts with an ASCII letter.
        for t in title_list[1:]:
            if t.string and t.string[0].encode('utf-8').isalpha():
                content_link = t.get('href')
                sub_url = root + content_link
                pokedex.append(sub_url)
    return pokedex
def getMainPage(urls):
    """Collect the image file-page links from each species page and cache them."""
    pokelist = {}
    for i in urls:
        print(i, flush=True)
        with requests.get(url=i, headers=headers) as response:
            data = response.content.decode()
            soup = bs4.BeautifulSoup(data, 'lxml')
            title_list = soup.select('.image[href]')
            for a in title_list:
                # Keep only images whose alt text contains the species name
                # taken from the end of the page URL.
                if i.split('/')[-1] in a.find('img').get('alt'):
                    pokelist[a.get('href').split(':')[-1]] = root + a.get('href')
                    print(a.get('href').split(':')[-1], root + a.get('href'), flush=True)
    # Persist the {image name: file-page URL} mapping for later runs.
    with open('Pokedex.json', 'w') as f:
        json.dump(pokelist, f)
    return pokelist
def getDetailContent(url):
    """Download the full-size image linked from an image file page."""
    with requests.get(url=url, headers=headers) as response:
        data = response.content.decode()
        soup = bs4.BeautifulSoup(data, 'lxml')
        content = soup.select('.fullImageLink>a')
        for i in content:
            print(i.get('href'), flush=True)
            # The href is protocol-relative, hence the explicit 'https:' prefix.
            img_url = i.get('href')
            with requests.get(url='https:' + img_url, headers=headers) as r:
                # Filenames containing 'Dream' are saved to a separate folder.
                if 'Dream' in img_url:
                    out_path = '/Users/videopls/Desktop/Gan-Pokemon/collections_dream/' + img_url.split('/')[-1]
                else:
                    out_path = '/Users/videopls/Desktop/Gan-Pokemon/collections/' + img_url.split('/')[-1]
                with open(out_path, 'wb') as img_file:
                    img_file.write(r.content)
    return
def mainFunc(t_list):
    """Download the artwork for every entry in the {name: file-page URL} dict."""
    for t in t_list:
        getDetailContent(t_list[t])
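# The __main__ block below assumes Pokedex.json already exists. A minimal
# bootstrap sketch for a first run (the helper name and flow are assumptions,
# not part of the original workflow): build the file once from the index page
# via getPokedex, then run the script as usual.
def bootstrapPokedex(path='Pokedex.json'):
    # Scrape the national-dex index page and cache the species-page URLs.
    species_urls = getPokedex(url)
    with open(path, 'w') as f:
        json.dump(species_urls, f)
    return species_urls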
if __name__ == '__main__':
    # Pokedex.json is expected to already hold the URLs to feed getMainPage;
    # getMainPage then overwrites it with the {name: file-page URL} mapping
    # before the downloads start.
    with open('Pokedex.json', 'r') as f:
        urls = json.load(f)
    mainFunc(getMainPage(urls))