Merge PR sqzw-x#83

* Fix: write custom actor names to nfo file * Fix: nfo文件写入错误空格；手动设置演员写入nfo文件 * Fix: mmtv custom website;try more actor matching rules * Fix: not match unknown actor * Fix: madouqu custom website;more actor matching attempts * Fix: non-javdb site writes irrelevant searchid field * Chore: guochan data collation * Fix: 7mmtv get more extrafanart pics * Fix: madouqu subtle parameter adjustment * Fix: guochan crawlers remove useless characters; number recognition a… * Feat: add hscangku and cableav crawlers * Merge branch 'sqzw-x:master' into master * Fix: PR compliance revisions * Merge pull request sqzw-x#73 from kikiyou18/master * Opt: javdbid 输出逻辑
northsea4 · Feb 7, 2024 · 6ebe99e · 6ebe99e
1 parent a046c69
commit 6ebe99e
Show file tree

Hide file tree

Showing 12 changed files with 614 additions and 243 deletions.
diff --git a/src/controllers/main_window/main_window.py b/src/controllers/main_window/main_window.py
@@ -2061,6 +2061,8 @@ def _netResult(self):
                         'mdtv': ['https://www.mdpjzip.xyz', ''],
                         'madouqu': ['https://madouqu.com', ''],
                         'cnmdb': ['https://cnmdb.net', ''],
+                        'hscangku': ['https://hscangku.net', ''],
+                        'cableav': ['https://cableav.tv', ''],
                         'lulubar': ['https://lulubar.co', ''],
                         'love6': ['https://love6.tv', ''],
                         'yesjav': ['http://www.yesjav.info', ''],

diff --git a/src/models/config/config_manual.py b/src/models/config/config_manual.py
@@ -67,6 +67,8 @@ class ManualConfig:
         'lulubar',
         'madouqu',
         'mdtv',
+        'hscangku',
+        'cableav',
         'mgstage',
         'mywife',
         'prestige',
@@ -513,6 +515,8 @@ class ManualConfig:
         'mdtv': 'mdtv',
         'mdpjzip': 'mdtv',
         'madouqu': 'madouqu',
+        'hsck': 'hscangku',
+        'cableav': 'cableav',
         'mgstage': 'mgstage',
         '7mmtv': '7mmtv',
         'bb9711': '7mmtv',

diff --git a/src/models/core/crawler.py b/src/models/core/crawler.py
@@ -11,7 +11,7 @@
 from models.core.flags import Flags
 from models.crawlers import airav_cc_new, airav_new, avsex, avsox, cnmdb, dahlia, dmm, faleno, fantastica, fc2, fc2club, \
     fc2hub, freejavbt, getchu, getchu_dmm, giga, hdouban, iqqtv_new, jav321, javbus, javdb, javlibrary_new, kin8, love6, \
-    lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity
+    lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity, hscangku, cableav
 from models.entity.enums import FileMode
 
 
@@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
     elif website == 'mgstage':
         json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number))
     elif website == '7mmtv':
-        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language))
+        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path))
     elif website == 'fc2':
         json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language))
     elif website == 'fc2hub':
@@ -137,6 +137,12 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
     elif website == 'madouqu':
         json_data = json.loads(
             madouqu.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
+    elif website == 'hscangku':
+        json_data = json.loads(
+            hscangku.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
+    elif website == 'cableav':
+        json_data = json.loads(
+            cableav.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number)) 
     elif website == 'getchu':
         json_data = json.loads(getchu.main(file_number, appoint_url, log_info, req_web, language))
     elif website == 'getchu_dmm':

diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py
@@ -81,8 +81,10 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
     nfo_title = config.naming_media
     if not number:
         number = title
+    # 默认emby视频标题配置为 [number title]，国产重复时需去掉一个，去重需注意空格也应一起去掉，否则国产的nfo标题中会多一个空格
+    # 读取nfo title信息会去掉前面的number和空格以保留title展示出来，同时number和标题一致时，去掉number的逻辑变成去掉整个标题导致读取失败，见426行
     if number == title and 'number' in nfo_title and 'title' in nfo_title:
-        nfo_title = nfo_title.replace('originaltitle', '').replace('title', '')
+        nfo_title = nfo_title.replace('originaltitle', '').replace('title', '').strip()
     first_letter = get_number_first_letter(number)
 
     # 处理演员
@@ -106,7 +108,7 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
         if not os.path.exists(folder_new_path):
             os.makedirs(folder_new_path)
         delete_file(nfo_new_path)  # 避免115出现重复文件
-        with open(nfo_new_path, "wt", encoding='UTF-8') as code:
+        with (open(nfo_new_path, "wt", encoding='UTF-8') as code):
             print('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>', file=code)
             print("<movie>", file=code)
 
@@ -205,18 +207,27 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
             if 'country,' in nfo_include_new:
                 print(f"  <countrycode>{country}</countrycode>", file=code)
 
-            # 输出演员
+            # 初始化 actor_list
+            actor_list = []
+            # 输出男女演员
             if 'actor_all,' in nfo_include_new:
                 actor = all_actor
-            if actor and actor != '未知演员' and actor != '未知演員' and 'actor,' in nfo_include_new:
+            # 有演员时输出演员
+            if 'actor,' in nfo_include_new and actor:
                 actor_list = actor.split(',')  # 字符串转列表
                 actor_list = [actor.strip() for actor in actor_list if actor.strip()]  # 去除空白
-                if actor_list:
-                    for each in actor_list:
-                        print("  <actor>", file=code)
-                        print("    <name>" + each + "</name>", file=code)
-                        print("    <type>Actor</type>", file=code)
-                        print("  </actor>", file=code)
+            # 无演员时输出演员 以文件命名设置中未知演员设置项为演员名，默认设置和空值不写入NFO
+            elif 'actor,' in nfo_include_new and config.actor_no_name not in ["未知演员", '未知演員', '']:
+                actor = config.actor_no_name
+                actor_list = actor.split(',')  # 字符串转列表
+                actor_list = [actor.strip() for actor in actor_list if actor.strip()]  # 去除空白
+                signal.add_log(f'⛑️ 无演员名, 使用手动命名 写入NFO {config.actor_no_name}')
+            if actor_list:
+                for each in actor_list:
+                    print("  <actor>", file=code)
+                    print("    <name>" + each + "</name>", file=code)
+                    print("    <type>Actor</type>", file=code)
+                    print("  </actor>", file=code)
 
             # 输出导演
             if director and 'director,' in nfo_include_new:
@@ -318,10 +329,11 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
                 print("  <website>" + website + "</website>", file=code)
 
             # javdb id 输出, 没有时使用番号搜索页
-            if 'javdbid' in json_data_nfo and json_data_nfo['javdbid']:
-                print("  <javdbid>" + json_data_nfo["javdbid"] + "</javdbid>", file=code)
-            else:
-                print("  <javdbsearchid>" + number + "</javdbsearchid>", file=code)
+            if "国产" not in json_data_nfo['mosaic'] and "國產" not in json_data_nfo['mosaic']:
+                if 'javdbid' in json_data_nfo and json_data_nfo['javdbid']:
+                    print("  <javdbid>" + json_data_nfo["javdbid"] + "</javdbid>", file=code)
+                else:
+                    print("  <javdbsearchid>" + number + "</javdbsearchid>", file=code)
             print("</movie>", file=code)
             json_data['logs'] += "\n 🍀 Nfo done! (new)(%ss)" % get_used_time(start_time)
             return True

diff --git a/src/models/crawlers/cableav.py b/src/models/crawlers/cableav.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import json
+import re
+import time
+
+import urllib3
+import zhconv
+from lxml import etree
+
+from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_extra_info, get_number_list
+
+urllib3.disable_warnings()  # yapf: disable
+
+
+# import traceback
+
+def get_actor_photo(actor):
+    """Map every actor name in a comma-separated string to an empty photo URL.
+
+    :param actor: comma-separated actor names, e.g. ``'A,B'``.
+    :return: dict keyed by the split names (original order, not stripped),
+        every value being ``''``.
+    """
+    return dict.fromkeys(actor.split(','), '')
+
+
+def get_detail_info(html, number, file_path):
+    """Extract title, actor, cover and tag from a parsed detail page.
+
+    :param html: parsed detail page exposing ``xpath()``.
+    :param number: number of the title; used as the title when the page has
+        no content paragraph, and returned unchanged.
+    :param file_path: forwarded to ``get_extra_info`` for actor lookup.
+    :return: tuple ``(number, title, actor, cover_url, tag)``.
+    """
+    paragraphs = html.xpath('//div[@class="entry-content "]/p/text()')
+    if paragraphs:
+        # Drop the "<number> " prefix so only the descriptive part remains.
+        title = paragraphs[0].replace(number + ' ', '').strip()
+    else:
+        title = number
+    actor = get_extra_info(title, file_path, info_type="actor")
+
+    categories = html.xpath('//header//div[@class="categories-wrap"]/a/text()')
+    # Only the first category is kept; it is converted to Simplified Chinese.
+    tag = zhconv.convert(categories[0], 'zh-cn') if categories else ''
+
+    og_images = html.xpath('//meta[@property="og:image"]/@content')
+    cover_url = og_images[0] if og_images else ''
+
+    return number, title, actor, cover_url, tag
+
+
+def get_real_url(html, number_list):
+    """Find the first search result whose title contains one of the numbers.
+
+    Titles and numbers are compared after removing every non-word character
+    and underscore and upper-casing, so ``EMTC-005`` matches ``emtc005``.
+
+    :param html: parsed search page exposing ``xpath()``.
+    :param number_list: candidate numbers / keywords, tried in order for
+        each result.
+    :return: ``(True, matched_number, title, detail_url)`` for the first hit,
+        otherwise ``(False, '', '', '')``.
+    """
+    # Normalise the candidates once. A candidate that is empty after
+    # normalisation is dropped: '' is a substring of every title and would
+    # make the very first search result a false match.
+    candidates = []
+    for n in number_list:
+        temp_n = re.sub(r'[\W_]', '', n).upper()
+        if temp_n:
+            candidates.append((n, temp_n))
+
+    item_list = html.xpath('//h3[contains(@class,"title")]//a[@href and @title]')
+    for each in item_list:
+        # href="https://cableav.tv/Xq1Sg3SvZPk/"
+        detail_url = each.get('href')
+        # An anchor may have no direct text node (e.g. it only wraps child
+        # elements); fall back to its title attribute instead of letting an
+        # IndexError abort the whole search.
+        text_nodes = each.xpath('text()')
+        title = text_nodes[0] if text_nodes else each.get('title')
+        if not (title and detail_url):
+            continue
+        temp_title = re.sub(r'[\W_]', '', title).upper()
+        for n, temp_n in candidates:
+            if temp_n in temp_title:
+                return True, n, title, detail_url
+    return False, '', '', ''
+
+
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''):
+    """Scrape cableav metadata for one title and return it as a JSON string.
+
+    When ``appoint_url`` is empty, the site search is queried with each
+    candidate keyword until ``get_real_url`` reports a match; the matched
+    detail page is then fetched and parsed.
+
+    :param number: number of the title; replaced by the matched keyword when
+        a search is performed.
+    :param appoint_url: detail-page URL to use directly, skipping the search.
+    :param log_info: log text accumulated so far; this function appends to it.
+    :param req_web: trail of requested sites; the site name and the elapsed
+        seconds are appended.
+    :param language: not read by this function.
+    :param file_path: forwarded to ``get_number_list`` and ``get_detail_info``.
+    :param appoint_number: forwarded to ``get_number_list``.
+    :return: JSON text shaped ``{'cableav': {'zh_cn': d, 'zh_tw': d, 'jp': d}}``
+        where ``d`` is the full metadata dict on success, or a reduced dict
+        carrying ``error_info`` when any step raised.
+    """
+    start_time = time.time()
+    website_name = 'cableav'
+    req_web += '-> %s' % website_name
+    title = ''
+    cover_url = ''
+    web_info = '\n       '
+    log_info += ' \n    🌐 cableav'
+    debug_info = ''
+    real_url = appoint_url
+    # Use the site address from config when it defines one, else the default.
+    cableav_url = getattr(config, 'cableav_website', 'https://cableav.tv')
+
+    try:
+        if not real_url:
+            # Build the search keywords: the first candidate number followed
+            # by the keywords derived from the file name.
+            number_list, filename_list = get_number_list(number, appoint_number, file_path)
+            n_list = number_list[:1] + filename_list
+            for each in n_list:
+                real_url = f'{cableav_url}/?s={each}'
+                # example: real_url = 'https://cableav.tv/s?s=%E6%9F%9A%E5%AD%90%E7%8C%AB'
+                debug_info = f'请求地址: {real_url} '
+                log_info += web_info + debug_info
+                result, response = curl_html(real_url)
+                if not result:
+                    # A failed request aborts the whole search; the remaining
+                    # keywords are not tried.
+                    debug_info = '网络请求错误: %s' % response
+                    log_info += web_info + debug_info
+                    raise Exception(debug_info)
+                search_page = etree.fromstring(response, etree.HTMLParser())
+                result, number, title, real_url = get_real_url(search_page, n_list)
+                # example: real_url = 'https://cableav.tv/hyfaqwfjhio'
+                if result:
+                    break
+            else:
+                # for-else: reached only when no keyword produced a match.
+                debug_info = '没有匹配的搜索结果'
+                log_info += web_info + debug_info
+                raise Exception(debug_info)
+
+        debug_info = f'番号地址: {real_url} '
+        log_info += web_info + debug_info
+        result, response = curl_html(real_url)
+
+        if not result:
+            debug_info = '没有找到数据 %s ' % response
+            log_info += web_info + debug_info
+            raise Exception(debug_info)
+
+        detail_page = etree.fromstring(response, etree.HTMLParser())
+        number, title, actor, cover_url, tag = get_detail_info(detail_page, number, file_path)
+        actor_photo = get_actor_photo(actor)
+
+        try:
+            # Assemble the result record; fields this site does not provide
+            # are left as empty strings.
+            dic = {
+                'number': number,
+                'title': title,
+                'originaltitle': title,
+                'actor': actor,
+                'outline': '',
+                'originalplot': '',
+                'tag': tag,
+                'release': '',
+                'year': '',
+                'runtime': '',
+                'score': '',
+                'series': '',
+                'country': 'CN',
+                'director': '',
+                'studio': '',
+                'publisher': '',
+                'source': 'cableav',
+                'website': real_url,
+                'actor_photo': actor_photo,
+                'cover': cover_url,
+                'poster': '',
+                'extrafanart': '',
+                'trailer': '',
+                'image_download': False,
+                'image_cut': 'no',
+                'log_info': log_info,
+                'error_info': '',
+                'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )),
+                'mosaic': '国产',
+                'wanted': '',
+            }
+            debug_info = '数据获取成功！'
+            log_info += web_info + debug_info
+            # Refresh the stored log so it includes the success message.
+            dic['log_info'] = log_info
+        except Exception as e:
+            debug_info = '数据生成出错: %s' % str(e)
+            log_info += web_info + debug_info
+            raise Exception(debug_info)
+
+    except Exception as e:
+        # Any failure above ends here; the error text is reported through
+        # 'error_info' in a reduced result record.
+        # print(traceback.format_exc())
+        debug_info = str(e)
+        dic = {
+            'title': '',
+            'cover': '',
+            'website': '',
+            'log_info': log_info,
+            'error_info': debug_info,
+            'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )),
+        }
+    # The same record is stored under all three language keys.
+    dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}}
+    js = json.dumps(
+        dic,
+        ensure_ascii=False,
+        sort_keys=False,
+        indent=4,
+        separators=(',', ': '),
+    )
+    return js
+
+
+if __name__ == '__main__':
+    # Manual smoke test: prints the JSON returned by main() for one sample
+    # title. The commented-out calls are alternative sample inputs.
+    # NOTE(review): main() goes through curl_html, so this presumably needs
+    # live network access - confirm before running in CI.
+    # yapf: disable
+    # print(main('SSN010'))
+    # print(main('國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露', file_path='國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露'))
+    # print(main('國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜', file_path='國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜'))
+    # print(main('韓國高端攝影頂 Yeha 私拍福利', file_path='韓國高端攝影頂 Yeha 私拍福利'))
+    print(main('EMTC-005', file_path='國產AV 愛神傳媒 EMTC005 怒操高冷社長秘書 米歐'))