From e38989dce66e8b05b75b82ee04b2b7a7b63d03bb Mon Sep 17 00:00:00 2001
From: runoob
Date: Sun, 14 Jan 2024 21:47:00 +0800
Subject: [PATCH] Fix: mmtv custom website;try more actor matching rules

---
NOTE(review): the line structure of this patch was restored from a copy in
which newlines and runs of whitespace had been collapsed. Added/removed text
and hunk line counts are intact, but the indentation of context lines, the
position of blank context lines, repeated spaces inside string literals, and
the two-line split of the example docstring in get_actor() are reconstructed.
Verify against the target tree (git apply --check --ignore-whitespace) before
applying.

 src/models/core/crawler.py  |  2 +-
 src/models/crawlers/mmtv.py | 56 +++++++++++++++++++++++++++++++++----
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/src/models/core/crawler.py b/src/models/core/crawler.py
index 04a5360..76386b4 100644
--- a/src/models/core/crawler.py
+++ b/src/models/core/crawler.py
@@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
     elif website == 'mgstage':
         json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number))
     elif website == '7mmtv':
-        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language))
+        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path))
     elif website == 'fc2':
         json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language))
     elif website == 'fc2hub':
diff --git a/src/models/crawlers/mmtv.py b/src/models/crawlers/mmtv.py
index 0e7e143..f5cc6b1 100644
--- a/src/models/crawlers/mmtv.py
+++ b/src/models/crawlers/mmtv.py
@@ -9,6 +9,8 @@
 
 from models.base.number import is_uncensored
 from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_actor_list, get_lable_list
 
 urllib3.disable_warnings()  # yapf: disable
 
@@ -30,15 +32,53 @@ def get_title(html, web_number):
     return result[0].replace(web_number, '').strip() if result else ''
 
 
-def get_actor(html):
+def get_actor(html, title, file_path):
     actor_list = html.xpath('//div[@class="fullvideo-idol"]/span/a/text()')
     actor = ''
     if actor_list:
         for each in actor_list:
             '''愛澄玲花,日高ゆりあ(青山ひより)
             菜津子 32歳 デザイナー'''
             actor += re.sub(r'(.+)', '', each).split(' ')[0] + ','
+    else:
+        actor = get_some_info(title, file_path, info_type="actor")
     return actor.strip(',')
+def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
+
+    all_info = title + file_path + tag + actor + series
+
+    # 未找到标签时,从各种信息里匹配
+    if info_type == "tag":
+        tag_list = []
+        all_tag = get_lable_list()
+        for each in all_tag:
+            if each in all_info:
+                tag_list.append(each)
+        new_tag_list = []
+        [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]
+        return ','.join(new_tag_list)
+
+    # 未找到演员时,看热门演员是否在标题和各种信息里
+    if info_type == "actor":
+        actor_list = []
+        all_actor = get_actor_list()
+        for each in all_actor:
+            if each in all_info:
+                actor_list.append(each)
+        new_actor_list = []
+        [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
+        return ','.join(new_actor_list)
+
+    # 未找到系列时,从各种信息里匹配
+    if info_type == "series":
+        series_list = []
+        all_series = get_lable_list()
+        for each in all_series:
+            if each in all_info:
+                series_list.append(each)
+        new_series_list = []
+        [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
+        return ','.join(new_series_list)
 
 def get_real_url(html, number):
     result = html.xpath('//figure[@class="video-preview"]/a')
@@ -134,7 +174,7 @@ def get_tag(html):
 
 
 def get_extrafanart(html):
-    result = html.xpath('//a[@class="screens-item fresco"]/@href')
+    result = html.xpath('//a[@class="lazyload screens-item fresco"]/@href')
     return result if result else ''
 
 
@@ -166,7 +206,7 @@ def get_number(html, number):
     return number.replace('FC2-PPV ', 'FC2-'), release, runtime, number
 
 
-def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path=''):
     start_time = time.time()
     website_name = '7mmtv'
     req_web += '-> %s' % website_name
@@ -175,9 +215,13 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
     web_info = '\n '
     log_info += ' \n 🌐 7mmtv'
     debug_info = ''
+    mmtv_url = 'https://www.7mmtv.sx'
+    if hasattr(config, '7mmtv_website'):
+        mmtv_url = getattr(config, '7mmtv_website')
     real_url = appoint_url
     # search_url = "https://bb9711.com/zh/searchform_search/all/index.html"
-    search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+    # search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+    search_url = f"{mmtv_url}/zh/searchform_search/all/index.html"
     mosaic = ''
 
     try:
@@ -186,7 +230,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
 
             if number.upper().startswith('FC2'):
                 search_keyword = re.findall(r'\d{3,}', number)[0]
-                search_url = f'https://7mmtv.sx/zh/searchform_search/all/index.html?search_keyword={search_keyword}&search_type=searchall&op=search'
+                search_url = f'{search_url}?search_keyword={search_keyword}&search_type=searchall&op=search'
             debug_info = f'搜索地址: {search_url} '
             log_info += web_info + debug_info
             result, response = curl_html(search_url)
@@ -220,7 +264,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
                 debug_info = '数据获取失败: 未获取到title!'
                 log_info += web_info + debug_info
                 raise Exception(debug_info)
-            actor = get_actor(html_info)
+            actor = get_actor(html_info, title, file_path)
             actor_photo = get_actor_photo(actor)
             cover_url = get_cover(html_content)
             outline, originalplot = get_outline(html_info)