-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmain.py
191 lines (159 loc) · 7.59 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
import requests
from bs4 import BeautifulSoup
import os
import re
import time
import base64
import hashlib
class GoogleSearcher:
    """Reverse-image-search helper driven by a headless Chrome browser.

    Every file found in the ``upload`` directory is submitted to the
    search-by-image page opened in :meth:`upload_img_get_html`. The resulting
    page's HTML source, the text of its ``<div id="search">`` element and the
    inline base64 thumbnails are stored in a per-image sub-directory of the
    ``download`` directory.
    """

    def __init__(self, upload="upload",
                 download="download"):
        """Store the working directories and create the ones that are missing.

        Args:
            upload: Directory that holds the images to be uploaded.
            download: Directory that receives the downloaded results.
        """
        super().__init__()
        self.upload = upload  # directory containing the images to upload
        self.download = download  # directory for the downloaded files
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}
        # makedirs instead of mkdir so that nested paths can be created too.
        if not os.path.exists(self.download):
            os.makedirs(self.download)  # stores the downloaded files
        if not os.path.exists(self.upload):
            os.makedirs(self.upload)  # stores the files to upload
            # The directory was just created, so it cannot contain images yet.
            print("{}中无文件".format(self.upload))

    def upload_img_get_html(self, file):
        """Upload one image through the browser and return the result page.

        Args:
            file: Path of the image; it is typed into the page's file
                ``<input>`` element.

        Returns:
            The HTML source of the page shown after the upload.

        Raises:
            selenium.common.exceptions.TimeoutException: If one of the
                awaited page elements does not become visible in time.
        """
        option = webdriver.ChromeOptions()
        option.add_argument("headless")
        # The Chrome driver executable is expected to be reachable, e.g.
        # saved in Python's Scripts directory.
        driver = webdriver.Chrome(options=option)
        try:
            driver.get("https://images.wjbaike.site/imghp")

            # Wait for the camera icon to the right of the search box ...
            WebDriverWait(driver, timeout=20, poll_frequency=0.5).until(
                expected_conditions.visibility_of_element_located(
                    (By.CLASS_NAME, "LM8x9c")))
            # ... and activate it once it is visible.
            # find_element(By, value) is used because the find_element_by_*
            # helpers no longer exist in Selenium 4.
            driver.find_element(By.CLASS_NAME, "LM8x9c").send_keys(Keys.ENTER)

            # Wait until the "upload an image" panel is shown.
            WebDriverWait(driver, timeout=200, poll_frequency=0.5).until(
                expected_conditions.visibility_of_element_located(
                    (By.ID, "qbug")))
            # Switch to the "upload an image" tab.
            driver.find_element(
                By.XPATH, '//*[@id="qbug"]/div/a').send_keys(Keys.ENTER)

            # Locate the file-upload input.
            WebDriverWait(driver, timeout=100, poll_frequency=0.5).until(
                expected_conditions.visibility_of_element_located(
                    (By.ID, 'qbfile')))
            # The upload control is a plain <input>, so send_keys with the
            # path is enough to submit the file.
            driver.find_element(By.ID, 'qbfile').send_keys(file)

            # Wait for the navigation to the result page.
            WebDriverWait(driver, timeout=20, poll_frequency=0.5).until(
                expected_conditions.visibility_of_element_located(
                    (By.XPATH, '//*[@id="top_nav"]')))
            # Give the page time to finish loading before reading its source;
            # can be lowered on a fast connection.
            time.sleep(6)
            # NOTE(review): re-fetching driver.current_url through get_html()
            # was left disabled in the original; its comment says it was
            # error-prone on a poor local network.
            return driver.page_source
        finally:
            # Always shut the browser down, otherwise every processed image
            # leaves a Chrome/driver process behind.
            driver.quit()

    def get_html(self, url):
        """Fetch ``url`` with the configured headers.

        Args:
            url: Address to request.

        Returns:
            The decoded response body, or ``None`` when the server answered
            with an HTTP error status.
        """
        with requests.Session() as session:
            response = session.get(url, headers=self.header)
            try:
                response.raise_for_status()
            except requests.HTTPError:
                return None
            response.encoding = response.apparent_encoding
            return response.text

    def get_img_list(self):
        """Return the full paths of the files in the upload directory.

        The upload directory is resolved against the current working
        directory. Sub-directories are skipped because only files can be
        uploaded, and the result is sorted by file name.
        """
        img_folder = os.path.join(os.getcwd(), self.upload)
        candidates = (os.path.join(img_folder, name)
                      for name in sorted(os.listdir(img_folder)))
        return [path for path in candidates if os.path.isfile(path)]

    def download_img_via_url(self, url, filename):
        """Download the image at ``url`` and save it as ``<filename>.png``.

        Best effort: nothing is written when the server answers with an HTTP
        error status.

        Args:
            url: Image address; protocol-relative URLs (``//host/...``) are
                completed with ``http:``.
            filename: Target path without extension.
        """
        img_name = self.process_filename(str(filename) + ".png")
        if url.startswith("//"):
            url = "http:" + url
        response = requests.get(url, headers=self.header)
        try:
            response.raise_for_status()
        except requests.HTTPError:
            return
        # Binary mode must not be given ``errors=``; open() rejects it.
        with open(img_name, "wb") as file:
            file.write(response.content)

    @staticmethod
    def download_img_via_base64(string, filename):
        """Decode a ``data:image/<type>;base64,<data>`` URI into a file.

        The file is written to ``<filename>.<type>``.

        Args:
            string: Text containing the data URI.
            filename: Target path without extension.

        Raises:
            ValueError: If ``string`` holds no base64 image data URI or the
                payload cannot be decoded.
        """
        pattern = re.compile("data:image/(.*?);base64,(.*?$)", re.I | re.M)
        match = pattern.search(string)
        if match is None:
            raise ValueError("no base64 image data URI found")
        extension, payload = match.groups()
        # Make sure the payload is padded; surplus '=' characters are
        # tolerated by the (non-strict) decoder.
        if not payload.endswith("=="):
            payload += "=="
        img_data = base64.b64decode(payload)
        img_name = str(filename) + "." + extension
        with open(img_name, "wb") as file:
            file.write(img_data)

    def analyse(self, html, img_dir, data_text_name):
        """Parse the result page and store its inline images and its text.

        Args:
            html: HTML source of the result page.
            img_dir: Directory that receives the decoded images, named by
                their running index.
            data_text_name: Path (without extension) of the ``.txt`` file
                that receives the text of the ``<div id="search">`` element.
        """
        soup = BeautifulSoup(html, "lxml")
        # The wanted images are embedded in <script> tags as base64 data URIs.
        pattern = re.compile("<script.*?>.*?(data:image.*?)['|\"];.*?</script>",
                             re.I | re.M)
        for i, s in enumerate(pattern.findall(html)):
            try:
                self.download_img_via_base64(s, img_dir + '/' + str(i))
            except ValueError as err:
                # One undecodable thumbnail must not abort the whole page.
                print("skipping inline image {}: {}".format(i, err))
        # Text content of the page; empty when the result container is absent.
        search_div = soup.find("div", id='search')
        text = search_div.get_text() if search_div is not None else ""
        data_text_name = self.process_filename(str(data_text_name) + ".txt")
        # UTF-8 keeps every character regardless of the platform's default
        # encoding and matches the encoding used for the saved HTML.
        with open(data_text_name, "w", encoding="utf-8",
                  errors='ignore') as file:
            file.write(text)

    @staticmethod
    def process_filename(filename):
        """Return a variant of ``filename`` that does not exist on disk yet.

        An unused name is returned unchanged. Otherwise a fixed three-character
        suffix (taken from the MD5 digest of a constant string) is inserted
        before the extension; if that name is taken as well, a running
        counter is appended until a free name is found.
        """
        if not os.path.exists(filename):
            return filename
        md = hashlib.md5()
        md.update("就用这个加密吧".encode("utf-8"))
        basename, extend = os.path.splitext(filename)
        stem = basename + "_" + md.hexdigest()[-3:]
        candidate = stem + extend
        counter = 1
        # The suffix is constant, so it alone cannot resolve a second clash.
        while os.path.exists(candidate):
            candidate = "{}_{}{}".format(stem, counter, extend)
            counter += 1
        return candidate

    def run(self):
        """Process every image of the upload directory in turn.

        For each image a sub-directory named after it is created inside the
        download directory; the result page's HTML, text and inline images
        are written there.
        """
        img_list = self.get_img_list()  # all images that must be uploaded
        if not img_list:
            print(self.upload + " 中间没有图片内容")
            return
        for img in img_list:
            # Name of the uploaded image, without directory and extension.
            img_name = os.path.splitext(os.path.basename(img))[0]
            print("正在处理图片{}".format(img_name))
            # Results of this image go into their own sub-directory.
            this_download_dir = self.download + "/" + img_name
            if not os.path.exists(this_download_dir):
                os.makedirs(this_download_dir)
            result_stem = this_download_dir + "/" + img_name
            html_name = self.process_filename("{}.html".format(result_stem))
            # HTML source of the page obtained after uploading the image.
            html_source = self.upload_img_get_html(img)
            with open(html_name, 'w', encoding='utf-8',
                      errors='ignore') as file:
                file.write(html_source)
            # Parse the page: save the inline images and the page text.
            self.analyse(html_source, this_download_dir, result_stem)
            print("图片{}处理完成\n".format(img_name))
if __name__ == "__main__":
    # Script entry point: build a searcher with the default "upload" and
    # "download" directories and process every image found.
    searcher = GoogleSearcher()
    searcher.run()