feat(web)!: del cloudscraper; bump curl-cffi to 0.6.0b9
sqzw-x committed Feb 20, 2024
1 parent ca38e46 commit 15a06ba
Showing 3 changed files with 42 additions and 42 deletions.
4 changes: 2 additions & 2 deletions requirements-mac.txt
@@ -1,6 +1,6 @@
 lxml==4.9.2
 pyquery==1.4.1
-cloudscraper==1.2.71
+# cloudscraper==1.2.71
 requests==2.24.0
 beautifulsoup4==4.9.3
 Pillow==9.4.0
@@ -14,5 +14,5 @@ opencv-contrib-python-headless==4.7.0.68
 deepl-translate==1.2.0
 ping3==4.0.4
 oshash==0.1.1
-curl-cffi==0.5.10
+curl-cffi==0.6.0b9
 AppKit
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,6 +1,6 @@
 lxml==4.9.2
 pyquery==1.4.1
-cloudscraper==1.2.71
+# cloudscraper==1.2.71
 requests==2.24.0
 beautifulsoup4==4.9.3
 Pillow==9.4.0
@@ -13,4 +13,4 @@ opencv-contrib-python-headless==4.7.0.68
 deepl-translate==1.2.0
 ping3==4.0.4
 oshash==0.1.1
-curl-cffi==0.5.10
+curl-cffi==0.6.0b9
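
The dependency bump and the impersonation change in web.py go together: the "chrome120" fingerprint used further down is, as far as I know, only available in the curl-cffi 0.6.x line, which would explain moving the pin from 0.5.10 to 0.6.0b9. A minimal compatibility probe, assuming only that curl_cffi exposes __version__ and rejects unknown impersonate targets (the URL is a placeholder, not from this repo):

# Quick probe that the installed curl_cffi supports the "chrome120" target.
# Illustrative sketch only; it is not part of this commit.
import curl_cffi
from curl_cffi import requests as curl_requests

print("curl_cffi", curl_cffi.__version__)  # expected to be 0.6.0b9 after this change

try:
    # Older releases such as 0.5.10 do not know "chrome120" and raise here.
    r = curl_requests.get("https://example.com", impersonate="chrome120", timeout=10)
    print("chrome120 impersonation OK:", r.status_code)
except Exception as e:
    print("chrome120 impersonation unavailable:", e)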
76 changes: 38 additions & 38 deletions src/models/base/web.py
@@ -9,7 +9,7 @@
 from threading import Lock
 from urllib.parse import quote
 
-import cloudscraper
+# import cloudscraper
 import curl_cffi.requests
 import requests
 import urllib3.util.connection as urllib3_cn
@@ -45,8 +45,8 @@ def __init__(self):
         self.session_g = requests.Session()
         self.session_g.mount('https://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100))
         self.session_g.mount('http://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100))
-        self.scraper = cloudscraper.create_scraper(
-            browser={'browser': 'firefox', 'platform': 'windows', 'mobile': False})  # returns a CloudScraper instance
+        # self.scraper = cloudscraper.create_scraper(
+        #     browser={'browser': 'firefox', 'platform': 'windows', 'mobile': False})  # returns a CloudScraper instance
         self.lock = Lock()
         self.pool = ThreadPoolExecutor(32)
         self.curl_session = curl_cffi.requests.Session()
@@ -157,39 +157,39 @@ def post_html(self, url: str, data=None, json=None, headers=None, cookies=None,
         signal.add_log(f"🔴 请求失败!{error_info}")
         return False, error_info
 
-    def scraper_html(self, url: str, proxies=True, cookies=None, headers=None):
-        # 获取代理信息
-        is_docker = config.is_docker
-        timeout = config.timeout
-        retry_times = config.retry
-        if is_docker:
-            return self.get_html(url, proxies=proxies, cookies=cookies)
-        if proxies:
-            proxies = config.proxies
-        else:
-            proxies = {
-                "http": None,
-                "https": None,
-            }
-
-        signal.add_log(f'🔎 Scraper请求 {url}')
-        for i in range(retry_times):
-            try:
-                with self.scraper.get(url, headers=headers, proxies=proxies, cookies=cookies, timeout=timeout) as f:
-                    response = f
-
-                if response.status_code > 299:
-                    error_info = f"{response.status_code} {url} {str(f.cookies).replace('<RequestsCookieJar[', '').replace(']>', '')}"
-                    return False, error_info
-                else:
-                    signal.add_log(f'✅ Scraper成功 {url}')
-                    response.encoding = 'utf-8'
-                    return True, f.text
-            except Exception as e:
-                error_info = '%s\nError: %s' % (url, e)
-                signal.add_log('🔴 重试 [%s/%s] %s' % (i + 1, retry_times, error_info))
-        signal.add_log(f"🔴 请求失败!{error_info}")
-        return False, error_info
+    # def scraper_html(self, url: str, proxies=True, cookies=None, headers=None):
+    #     # 获取代理信息
+    #     is_docker = config.is_docker
+    #     timeout = config.timeout
+    #     retry_times = config.retry
+    #     if is_docker:
+    #         return self.get_html(url, proxies=proxies, cookies=cookies)
+    #     if proxies:
+    #         proxies = config.proxies
+    #     else:
+    #         proxies = {
+    #             "http": None,
+    #             "https": None,
+    #         }
+    #
+    #     signal.add_log(f'🔎 Scraper请求 {url}')
+    #     for i in range(retry_times):
+    #         try:
+    #             with self.scraper.get(url, headers=headers, proxies=proxies, cookies=cookies, timeout=timeout) as f:
+    #                 response = f
+    #
+    #             if response.status_code > 299:
+    #                 error_info = f"{response.status_code} {url} {str(f.cookies).replace('<RequestsCookieJar[', '').replace(']>', '')}"
+    #                 return False, error_info
+    #             else:
+    #                 signal.add_log(f'✅ Scraper成功 {url}')
+    #                 response.encoding = 'utf-8'
+    #                 return True, f.text
+    #         except Exception as e:
+    #             error_info = '%s\nError: %s' % (url, e)
+    #             signal.add_log('🔴 重试 [%s/%s] %s' % (i + 1, retry_times, error_info))
+    #     signal.add_log(f"🔴 请求失败!{error_info}")
+    #     return False, error_info
 
     def _get_filesize(self, url):
         proxies = config.proxies
@@ -312,7 +312,7 @@ def curl_html(self, url, headers=None, proxies=True, cookies=None):
         for i in range(int(retry_times)):
             try:
                 response = self.curl_session.get(url_encode(url), headers=headers, cookies=cookies, proxies=proxies,
-                                                 impersonate="edge99")
+                                                 impersonate="chrome120")
                 if 'amazon' in url:
                     response.encoding = 'Shift_JIS'
                 else:
@@ -334,7 +334,7 @@ def curl_html(self, url, headers=None, proxies=True, cookies=None):
 web = WebRequests()
 get_html = web.get_html
 post_html = web.post_html
-scraper_html = web.scraper_html
+scraper_html = web.curl_html
 multi_download = web.multi_download
 curl_html = web.curl_html

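Because scraper_html is now bound to web.curl_html at module level, existing call sites keep their name but go through curl_cffi's TLS impersonation instead of a CloudScraper session. A rough sketch of that replacement pattern, with names, defaults, and retry logic simplified for illustration (the actual implementation lives in curl_html above):

# Illustrative replacement for the removed cloudscraper path, not the exact code above.
from curl_cffi import requests as curl_requests

session = curl_requests.Session()

def fetch(url, headers=None, cookies=None, proxies=None, timeout=30, retries=3):
    """Fetch a page with a Chrome 120 TLS fingerprint instead of cloudscraper."""
    last_error = ""
    for attempt in range(retries):
        try:
            response = session.get(url, headers=headers, cookies=cookies,
                                   proxies=proxies, timeout=timeout,
                                   impersonate="chrome120")
            if response.status_code > 299:
                return False, f"{response.status_code} {url}"
            response.encoding = 'utf-8'
            return True, response.text
        except Exception as e:
            last_error = f"{url}\nError: {e}"  # retry on network/TLS errors
    return False, last_error

# usage: ok, text = fetch("https://example.com")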
