Commit

Fix: mmtv custom website; try more actor matching rules
runoob11 committed Feb 1, 2024
1 parent fc88133 commit e38989d
Showing 2 changed files with 51 additions and 7 deletions.
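Summary: the 7mmtv crawler now honours a user-configured site address (falling back to https://www.7mmtv.sx) when building its search URL, and when a detail page lists no performers it tries to match known actor names from models.crawlers.guochan against the video title and the local file path, which is why crawler.py now passes file_path through.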
2 changes: 1 addition & 1 deletion src/models/core/crawler.py
@@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
    elif website == 'mgstage':
        json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number))
    elif website == '7mmtv':
-        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language))
+        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path))
    elif website == 'fc2':
        json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language))
    elif website == 'fc2hub':
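On the caller side the only change is that the 7mmtv branch forwards file_path, so the crawler can fall back to the filename when the page itself names no performers. A minimal sketch of a direct call with the new argument, using made-up values and assuming the models.crawlers package is importable as in crawler.py above:

    import json

    from models.crawlers import mmtv

    # Hypothetical number and local file path, for illustration only.
    file_number = 'FC2-1234567'
    file_path = '/downloads/FC2-1234567 愛澄玲花.mp4'

    # Same positional order as the crawler.py call above: number, appoint_url, log_info, req_web, language, file_path.
    json_data = json.loads(mmtv.main(file_number, '', '', '', 'zh_cn', file_path))
    print(json_data)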
56 changes: 50 additions & 6 deletions src/models/crawlers/mmtv.py
@@ -9,6 +9,8 @@

from models.base.number import is_uncensored
from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_actor_list, get_lable_list

urllib3.disable_warnings() # yapf: disable

@@ -30,15 +32,53 @@ def get_title(html, web_number):
    return result[0].replace(web_number, '').strip() if result else ''


-def get_actor(html):
+def get_actor(html, title, file_path):
    actor_list = html.xpath('//div[@class="fullvideo-idol"]/span/a/text()')
    actor = ''
    if actor_list:
        for each in actor_list:
            # e.g. '愛澄玲花,日高ゆりあ(青山ひより) 菜津子 32歳 デザイナー': drop the parenthesised alias, keep the text before the first space
            actor += re.sub(r'(.+)', '', each).split(' ')[0] + ','
+    else:
+        actor = get_some_info(title, file_path, info_type="actor")
    return actor.strip(',')

+def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
+
+    all_info = title + file_path + tag + actor + series
+
+    # No tag found on the page: match known tags against the combined info
+    if info_type == "tag":
+        tag_list = []
+        all_tag = get_lable_list()
+        for each in all_tag:
+            if each in all_info:
+                tag_list.append(each)
+        new_tag_list = []
+        [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]  # dedupe, keep order
+        return ','.join(new_tag_list)
+
+    # No actor found on the page: check whether a known actor appears in the title or other info
+    if info_type == "actor":
+        actor_list = []
+        all_actor = get_actor_list()
+        for each in all_actor:
+            if each in all_info:
+                actor_list.append(each)
+        new_actor_list = []
+        [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
+        return ','.join(new_actor_list)
+
+    # No series found on the page: match known series names against the combined info
+    if info_type == "series":
+        series_list = []
+        all_series = get_lable_list()
+        for each in all_series:
+            if each in all_info:
+                series_list.append(each)
+        new_series_list = []
+        [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
+        return ','.join(new_series_list)

def get_real_url(html, number):
    result = html.xpath('//figure[@class="video-preview"]/a')
@@ -134,7 +174,7 @@ def get_tag(html):


def get_extrafanart(html):
-    result = html.xpath('//a[@class="screens-item fresco"]/@href')
+    result = html.xpath('//a[@class="lazyload screens-item fresco"]/@href')
    return result if result else ''


@@ -166,7 +206,7 @@ def get_number(html, number):
    return number.replace('FC2-PPV ', 'FC2-'), release, runtime, number


-def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path=''):
    start_time = time.time()
    website_name = '7mmtv'
    req_web += '-> %s' % website_name
@@ -175,9 +215,13 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
    web_info = '\n '
    log_info += ' \n 🌐 7mmtv'
    debug_info = ''
+    mmtv_url = 'https://www.7mmtv.sx'
+    if hasattr(config, '7mmtv_website'):
+        mmtv_url = getattr(config, '7mmtv_website')
    real_url = appoint_url
    # search_url = "https://bb9711.com/zh/searchform_search/all/index.html"
-    search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+    # search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+    search_url = f"{mmtv_url}/zh/searchform_search/all/index.html"
    mosaic = ''

    try:
@@ -186,7 +230,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
        if number.upper().startswith('FC2'):
            search_keyword = re.findall(r'\d{3,}', number)[0]

-        search_url = f'https://7mmtv.sx/zh/searchform_search/all/index.html?search_keyword={search_keyword}&search_type=searchall&op=search'
+        search_url = f'{search_url}?search_keyword={search_keyword}&search_type=searchall&op=search'
        debug_info = f'搜索地址: {search_url} '
        log_info += web_info + debug_info
        result, response = curl_html(search_url)
@@ -220,7 +264,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
            debug_info = '数据获取失败: 未获取到title!'
            log_info += web_info + debug_info
            raise Exception(debug_info)
-        actor = get_actor(html_info)
+        actor = get_actor(html_info, title, file_path)
        actor_photo = get_actor_photo(actor)
        cover_url = get_cover(html_content)
        outline, originalplot = get_outline(html_info)
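Two things to note about the new code above. First, main() now reads an optional 7mmtv_website attribute from the config object via hasattr/getattr (the name starts with a digit, so normal attribute syntax would not parse) and builds the search URL from that base instead of the hard-coded https://7mmtv.sx. Second, get_some_info is a plain substring scan: it concatenates the title, the file path and any already-known fields, keeps every known actor/tag/series name that occurs in that blob, and de-duplicates while preserving first-seen order. A standalone sketch of that matching idea, where match_known_names is a hypothetical helper and the made-up name list stands in for guochan.get_actor_list():

    def match_known_names(known_names, *info_parts):
        # Join everything known about the file into one string, then keep the names
        # that appear in it, first-seen order, no duplicates (mirrors get_some_info).
        all_info = ''.join(info_parts)
        matched = []
        for name in known_names:
            if name and name in all_info and name not in matched:
                matched.append(name)
        return ','.join(matched)


    # Hypothetical title and path; the names reuse the example string from get_actor above.
    title = '愛澄玲花の新作'
    file_path = '/downloads/7mmtv/愛澄玲花.mp4'
    print(match_known_names(['愛澄玲花', '日高ゆりあ'], title, file_path))  # -> 愛澄玲花

Because the scan is substring-based, a longer name that contains a shorter one will match both; how often that happens depends on the data returned by get_actor_list() and get_lable_list().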
