From 15a06ba4ec1e52ae574c21b0da205e1b06ec3ab2 Mon Sep 17 00:00:00 2001
From: sqzw-x
Date: Tue, 20 Feb 2024 18:59:09 +0800
Subject: [PATCH] feat(web)!: del cloudscraper; bump curl-cffi to 0.6.0b9

---
 requirements-mac.txt   |  4 +--
 requirements.txt       |  4 +--
 src/models/base/web.py | 76 +++++++++++++++++++++---------------------
 3 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/requirements-mac.txt b/requirements-mac.txt
index 70ba498..cd4523f 100644
--- a/requirements-mac.txt
+++ b/requirements-mac.txt
@@ -1,6 +1,6 @@
 lxml==4.9.2
 pyquery==1.4.1
-cloudscraper==1.2.71
+# cloudscraper==1.2.71
 requests==2.24.0
 beautifulsoup4==4.9.3
 Pillow==9.4.0
@@ -14,5 +14,5 @@ opencv-contrib-python-headless==4.7.0.68
 deepl-translate==1.2.0
 ping3==4.0.4
 oshash==0.1.1
-curl-cffi==0.5.10
+curl-cffi==0.6.0b9
 AppKit
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index fbe34f5..3a1e3a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 lxml==4.9.2
 pyquery==1.4.1
-cloudscraper==1.2.71
+# cloudscraper==1.2.71
 requests==2.24.0
 beautifulsoup4==4.9.3
 Pillow==9.4.0
@@ -13,4 +13,4 @@ opencv-contrib-python-headless==4.7.0.68
 deepl-translate==1.2.0
 ping3==4.0.4
 oshash==0.1.1
-curl-cffi==0.5.10
\ No newline at end of file
+curl-cffi==0.6.0b9
\ No newline at end of file
diff --git a/src/models/base/web.py b/src/models/base/web.py
index 855e915..9f57fab 100644
--- a/src/models/base/web.py
+++ b/src/models/base/web.py
@@ -9,7 +9,7 @@
 from threading import Lock
 from urllib.parse import quote
 
-import cloudscraper
+# import cloudscraper
 import curl_cffi.requests
 import requests
 import urllib3.util.connection as urllib3_cn
@@ -45,8 +45,8 @@ def __init__(self):
         self.session_g = requests.Session()
         self.session_g.mount('https://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100))
         self.session_g.mount('http://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100))
-        self.scraper = cloudscraper.create_scraper(
-            browser={'browser': 'firefox', 'platform': 'windows', 'mobile': False})  # returns a CloudScraper instance
+        # self.scraper = cloudscraper.create_scraper(
+        #     browser={'browser': 'firefox', 'platform': 'windows', 'mobile': False})  # returns a CloudScraper instance
         self.lock = Lock()
         self.pool = ThreadPoolExecutor(32)
         self.curl_session = curl_cffi.requests.Session()
@@ -157,39 +157,39 @@ def post_html(self, url: str, data=None, json=None, headers=None, cookies=None,
         signal.add_log(f"🔴 请求失败!{error_info}")
         return False, error_info
 
-    def scraper_html(self, url: str, proxies=True, cookies=None, headers=None):
-        # 获取代理信息
-        is_docker = config.is_docker
-        timeout = config.timeout
-        retry_times = config.retry
-        if is_docker:
-            return self.get_html(url, proxies=proxies, cookies=cookies)
-        if proxies:
-            proxies = config.proxies
-        else:
-            proxies = {
-                "http": None,
-                "https": None,
-            }
-
-        signal.add_log(f'🔎 Scraper请求 {url}')
-        for i in range(retry_times):
-            try:
-                with self.scraper.get(url, headers=headers, proxies=proxies, cookies=cookies, timeout=timeout) as f:
-                    response = f
-
-                if response.status_code > 299:
-                    error_info = f"{response.status_code} {url} {str(f.cookies).replace('', '')}"
-                    return False, error_info
-                else:
-                    signal.add_log(f'✅ Scraper成功 {url}')
-                    response.encoding = 'utf-8'
-                    return True, f.text
-            except Exception as e:
-                error_info = '%s\nError: %s' % (url, e)
-                signal.add_log('🔴 重试 [%s/%s] %s' % (i + 1, retry_times, error_info))
-        signal.add_log(f"🔴 请求失败!{error_info}")
-        return False, error_info
+    # def scraper_html(self, url: str, proxies=True, cookies=None, headers=None):
+    #     # 获取代理信息
+    #     is_docker = config.is_docker
+    #     timeout = config.timeout
+    #     retry_times = config.retry
+    #     if is_docker:
+    #         return self.get_html(url, proxies=proxies, cookies=cookies)
+    #     if proxies:
+    #         proxies = config.proxies
+    #     else:
+    #         proxies = {
+    #             "http": None,
+    #             "https": None,
+    #         }
+    #
+    #     signal.add_log(f'🔎 Scraper请求 {url}')
+    #     for i in range(retry_times):
+    #         try:
+    #             with self.scraper.get(url, headers=headers, proxies=proxies, cookies=cookies, timeout=timeout) as f:
+    #                 response = f
+    #
+    #             if response.status_code > 299:
+    #                 error_info = f"{response.status_code} {url} {str(f.cookies).replace('', '')}"
+    #                 return False, error_info
+    #             else:
+    #                 signal.add_log(f'✅ Scraper成功 {url}')
+    #                 response.encoding = 'utf-8'
+    #                 return True, f.text
+    #         except Exception as e:
+    #             error_info = '%s\nError: %s' % (url, e)
+    #             signal.add_log('🔴 重试 [%s/%s] %s' % (i + 1, retry_times, error_info))
+    #     signal.add_log(f"🔴 请求失败!{error_info}")
+    #     return False, error_info
 
     def _get_filesize(self, url):
         proxies = config.proxies
@@ -312,7 +312,7 @@ def curl_html(self, url, headers=None, proxies=True, cookies=None):
         for i in range(int(retry_times)):
             try:
                 response = self.curl_session.get(url_encode(url), headers=headers, cookies=cookies, proxies=proxies,
-                                                 impersonate="edge99")
+                                                 impersonate="chrome120")
                 if 'amazon' in url:
                     response.encoding = 'Shift_JIS'
                 else:
@@ -334,7 +334,7 @@ def curl_html(self, url, headers=None, proxies=True, cookies=None):
 web = WebRequests()
 get_html = web.get_html
 post_html = web.post_html
-scraper_html = web.scraper_html
+scraper_html = web.curl_html
 multi_download = web.multi_download
 curl_html = web.curl_html
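
Note on usage after this patch: scraper_html now simply aliases web.curl_html, so pages previously fetched through cloudscraper go through curl_cffi's browser impersonation instead. The sketch below illustrates that call pattern in isolation; the fetch() helper, the example URL, the timeout, and the retry count are illustrative assumptions, not code from this repository.

import curl_cffi.requests

session = curl_cffi.requests.Session()

def fetch(url, proxies=None, retries=3):
    # Mirror the patched curl_html path: impersonate a Chrome 120 TLS/HTTP2
    # fingerprint (the "chrome120" target, which the pinned curl-cffi 0.6.0b9 provides).
    last_error = ""
    for _ in range(retries):
        try:
            resp = session.get(url, impersonate="chrome120", proxies=proxies, timeout=10)
            resp.encoding = "utf-8"
            if resp.status_code < 300:
                return True, resp.text
            return False, f"{resp.status_code} {url}"
        except Exception as e:
            last_error = f"{url}\nError: {e}"
    return False, last_error

ok, html = fetch("https://example.com")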