From 6ebe99ed7506f82dc91d07d0b9fd67eafc844158 Mon Sep 17 00:00:00 2001
From: sqzw-x
Date: Wed, 7 Feb 2024 09:59:38 +0800
Subject: [PATCH] Merge PR #83
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix: write custom actor names to nfo file
* Fix: erroneous whitespace written to nfo file; write manually set actors to nfo file
* Fix: mmtv custom website; try more actor matching rules
* Fix: do not match unknown actor
* Fix: madouqu custom website; more actor matching attempts
* Fix: non-javdb sites write an irrelevant searchid field
* Chore: guochan data collation
* Fix: 7mmtv gets more extrafanart pics
* Fix: madouqu subtle parameter adjustment
* Fix: guochan crawlers remove useless characters; number recognition a…
* Feat: add hscangku and cableav crawlers
* Merge branch 'sqzw-x:master' into master
* Fix: PR compliance revisions
* Merge pull request #73 from kikiyou18/master
* Opt: javdbid output logic
---
 src/controllers/main_window/main_window.py |   2 +
 src/models/config/config_manual.py         |   4 +
 src/models/core/crawler.py                 |  10 +-
 src/models/core/nfo.py                     |  40 ++-
 src/models/crawlers/cableav.py             | 176 ++++++++++
 src/models/crawlers/guochan.py             | 368 +++++++++------------
 src/models/crawlers/hscangku.py            | 196 +++++++++++
 src/models/crawlers/jav321.py              |  12 +-
 src/models/crawlers/javbus.py              |   2 +-
 src/models/crawlers/madouqu.py             |  15 +-
 src/models/crawlers/mdtv.py                |   2 +
 src/models/crawlers/mmtv.py                |  30 +-
 12 files changed, 614 insertions(+), 243 deletions(-)
 create mode 100644 src/models/crawlers/cableav.py
 create mode 100644 src/models/crawlers/hscangku.py

diff --git a/src/controllers/main_window/main_window.py b/src/controllers/main_window/main_window.py
index b471cf3..3b3676b 100644
--- a/src/controllers/main_window/main_window.py
+++ b/src/controllers/main_window/main_window.py
@@ -2061,6 +2061,8 @@ def _netResult(self):
             'mdtv': ['https://www.mdpjzip.xyz', ''],
             'madouqu': ['https://madouqu.com', ''],
             'cnmdb': ['https://cnmdb.net', ''],
+            'hscangku': ['https://hscangku.net', ''],
+            'cableav': ['https://cableav.tv', ''],
             'lulubar': ['https://lulubar.co', ''],
             'love6': ['https://love6.tv', ''],
             'yesjav': ['http://www.yesjav.info', ''],
diff --git a/src/models/config/config_manual.py b/src/models/config/config_manual.py
index e97a984..5c59e21 100644
--- a/src/models/config/config_manual.py
+++ b/src/models/config/config_manual.py
@@ -67,6 +67,8 @@ class ManualConfig:
         'lulubar',
         'madouqu',
         'mdtv',
+        'hscangku',
+        'cableav',
         'mgstage',
         'mywife',
         'prestige',
@@ -513,6 +515,8 @@ class ManualConfig:
         'mdtv': 'mdtv',
         'mdpjzip': 'mdtv',
         'madouqu': 'madouqu',
+        'hsck': 'hscangku',
+        'cableav': 'cableav',
         'mgstage': 'mgstage',
         '7mmtv': '7mmtv',
         'bb9711': '7mmtv',
diff --git a/src/models/core/crawler.py b/src/models/core/crawler.py
index 4e3d078..aebeba1 100644
--- a/src/models/core/crawler.py
+++ b/src/models/core/crawler.py
@@ -11,7 +11,7 @@
 from models.core.flags import Flags
 from models.crawlers import airav_cc_new, airav_new, avsex, avsox, cnmdb, dahlia, dmm, faleno, fantastica, fc2, fc2club, \
     fc2hub, freejavbt, getchu, getchu_dmm, giga, hdouban, iqqtv_new, jav321, javbus, javdb, javlibrary_new, kin8, love6, \
-    lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity
+    lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity, hscangku, cableav
 from models.entity.enums import FileMode


@@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
     elif website == 'mgstage':
        json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number))
     elif website == '7mmtv':
-        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language))
+        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path))
     elif website == 'fc2':
         json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language))
     elif website == 'fc2hub':
@@ -137,6 +137,12 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
     elif website == 'madouqu':
         json_data = json.loads(
             madouqu.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
+    elif website == 'hscangku':
+        json_data = json.loads(
+            hscangku.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
+    elif website == 'cableav':
+        json_data = json.loads(
+            cableav.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
     elif website == 'getchu':
         json_data = json.loads(getchu.main(file_number, appoint_url, log_info, req_web, language))
     elif website == 'getchu_dmm':
diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py
index a0467e0..bccd5a7 100644
--- a/src/models/core/nfo.py
+++ b/src/models/core/nfo.py
@@ -81,8 +81,10 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
         nfo_title = config.naming_media
     if not number:
         number = title
+    # 默认emby视频标题配置为 [number title],国产重复时需去掉一个,去重需注意空格也应一起去掉,否则国产的nfo标题中会多一个空格
+    # 读取nfo title信息会去掉前面的number和空格以保留title展示出来,同时number和标题一致时,去掉number的逻辑变成去掉整个标题导致读取失败,见426行
     if number == title and 'number' in nfo_title and 'title' in nfo_title:
-        nfo_title = nfo_title.replace('originaltitle', '').replace('title', '')
+        nfo_title = nfo_title.replace('originaltitle', '').replace('title', '').strip()
     first_letter = get_number_first_letter(number)

     # 处理演员
@@ -106,7 +108,7 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
         if not os.path.exists(folder_new_path):
             os.makedirs(folder_new_path)
         delete_file(nfo_new_path)  # 避免115出现重复文件
-        with open(nfo_new_path, "wt", encoding='UTF-8') as code:
+        with (open(nfo_new_path, "wt", encoding='UTF-8') as code):
             print('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>', file=code)
             print("<movie>", file=code)
@@ -205,18 +207,27 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
         if 'country,' in nfo_include_new:
             print(f"  <countrycode>{country}</countrycode>", file=code)

-        # 输出演员
+        # 初始化 actor_list
+        actor_list = []
+        # 输出男女演员
         if 'actor_all,' in nfo_include_new:
             actor = all_actor
-        if actor and actor != '未知演员' and actor != '未知演員' and 'actor,' in nfo_include_new:
+        # 有演员时输出演员
+        if 'actor,' in nfo_include_new and actor:
             actor_list = actor.split(',')  # 字符串转列表
             actor_list = [actor.strip() for actor in actor_list if actor.strip()]  # 去除空白
-            if actor_list:
-                for each in actor_list:
-                    print("  <actor>", file=code)
-                    print("    <name>" + each + "</name>", file=code)
-                    print("    <type>Actor</type>", file=code)
-                    print("  </actor>", file=code)
+        # 无演员时输出演员 以文件命名设置中未知演员设置项为演员名,默认设置和空值不写入NFO
+        elif 'actor,' in nfo_include_new and config.actor_no_name not in ["未知演员", '未知演員', '']:
+            actor = config.actor_no_name
+            actor_list = actor.split(',')  # 字符串转列表
+            actor_list = [actor.strip() for actor in actor_list if actor.strip()]  # 去除空白
+            signal.add_log(f'⛑️ 无演员名, 使用手动命名 写入NFO {config.actor_no_name}')
+        if actor_list:
+            for each in actor_list:
+                print("  <actor>", file=code)
+                print("    <name>" + each + "</name>", file=code)
+                print("    <type>Actor</type>", file=code)
+                print("  </actor>", file=code)

         # 输出导演
         if director and 'director,' in nfo_include_new:
@@ -318,10 +329,11 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
             print("  <website>" + website + "</website>", file=code)

         # javdb id 输出, 没有时使用番号搜索页
-        if 'javdbid' in json_data_nfo and json_data_nfo['javdbid']:
-            print("  <javdbid>" + json_data_nfo["javdbid"] + "</javdbid>", file=code)
-        else:
-            print("  <javdbsearchid>" + number + "</javdbsearchid>", file=code)
+        if "国产" not in json_data_nfo['mosaic'] and "國產" not in json_data_nfo['mosaic']:
+            if 'javdbid' in json_data_nfo and json_data_nfo['javdbid']:
+                print("  <javdbid>" + json_data_nfo["javdbid"] + "</javdbid>", file=code)
+            else:
+                print("  <javdbsearchid>" + number + "</javdbsearchid>", file=code)
         print("</movie>", file=code)
         json_data['logs'] += "\n 🍀 Nfo done! (new)(%ss)" % get_used_time(start_time)
         return True
diff --git a/src/models/crawlers/cableav.py b/src/models/crawlers/cableav.py
new file mode 100644
index 0000000..f7fd337
--- /dev/null
+++ b/src/models/crawlers/cableav.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import json
+import re
+import time
+
+import urllib3
+import zhconv
+from lxml import etree
+
+from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_extra_info, get_number_list
+
+urllib3.disable_warnings()  # yapf: disable
+
+
+# import traceback
+
+def get_actor_photo(actor):
+    actor = actor.split(',')
+    data = {}
+    for i in actor:
+        actor_photo = {i: ''}
+        data.update(actor_photo)
+    return data
+
+
+def get_detail_info(html, number, file_path):
+    title_h1 = html.xpath('//div[@class="entry-content "]/p/text()')
+    title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number
+    actor = get_extra_info(title, file_path, info_type="actor")
+    tmp_tag = html.xpath('//header//div[@class="categories-wrap"]/a/text()')
+    # 标签转简体
+    tag = zhconv.convert(tmp_tag[0], 'zh-cn') if tmp_tag else ''
+    cover_url = html.xpath(f'//meta[@property="og:image"]/@content')
+    cover_url = cover_url[0] if cover_url else ''
+
+    return number, title, actor, cover_url, tag
+
+
+def get_real_url(html, number_list):
+    item_list = html.xpath('//h3[contains(@class,"title")]//a[@href and @title]')
+    for each in item_list:
+        # href="https://cableav.tv/Xq1Sg3SvZPk/"
+        detail_url = each.get('href')
+        title = each.xpath('text()')[0]
+        if title and detail_url:
+            for n in number_list:
+                temp_n = re.sub(r'[\W_]', '', n).upper()
+                temp_title = re.sub(r'[\W_]', '', title).upper()
+                if temp_n in temp_title:
+                    return True, n, title, detail_url
+    return False, '', '', ''
+
+
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''):
+    start_time = time.time()
+    website_name = 'cableav'
+    req_web += '-> %s' % website_name
+    title = ''
+    cover_url = ''
+    web_info = '\n       '
+    log_info += ' \n    🌐 cableav'
+    debug_info = ''
+    real_url = appoint_url
+    cableav_url = getattr(config, 'cableav_website', 'https://cableav.tv')
+
+    try:
+        if not real_url:
+            # 处理番号
+            number_list, filename_list = get_number_list(number, appoint_number, file_path)
+            n_list = number_list[:1] + filename_list
+            for each in n_list:
+                real_url = f'{cableav_url}/?s={each}'
+                # real_url = 'https://cableav.tv/s?s=%E6%9F%9A%E5%AD%90%E7%8C%AB'
+                debug_info = f'请求地址: {real_url} '
+                log_info += web_info + debug_info
+                result, response = curl_html(real_url)
+                if not result:
+                    debug_info = '网络请求错误: %s' % response
+                    log_info += web_info + debug_info
+                    raise Exception(debug_info)
+                search_page = etree.fromstring(response, etree.HTMLParser())
+                result, number, title, real_url = get_real_url(search_page, n_list)
+                # real_url = 'https://cableav.tv/hyfaqwfjhio'
+                if result:
+                    break
+            else:
+                debug_info = 
'没有匹配的搜索结果' + log_info += web_info + debug_info + raise Exception(debug_info) + + debug_info = f'番号地址: {real_url} ' + log_info += web_info + debug_info + result, response = curl_html(real_url) + + if not result: + debug_info = '没有找到数据 %s ' % response + log_info += web_info + debug_info + raise Exception(debug_info) + + detail_page = etree.fromstring(response, etree.HTMLParser()) + number, title, actor, cover_url, tag = get_detail_info(detail_page, number, file_path) + actor_photo = get_actor_photo(actor) + + try: + dic = { + 'number': number, + 'title': title, + 'originaltitle': title, + 'actor': actor, + 'outline': '', + 'originalplot': '', + 'tag': tag, + 'release': '', + 'year': '', + 'runtime': '', + 'score': '', + 'series': '', + 'country': 'CN', + 'director': '', + 'studio': '', + 'publisher': '', + 'source': 'cableav', + 'website': real_url, + 'actor_photo': actor_photo, + 'cover': cover_url, + 'poster': '', + 'extrafanart': '', + 'trailer': '', + 'image_download': False, + 'image_cut': 'no', + 'log_info': log_info, + 'error_info': '', + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + 'mosaic': '国产', + 'wanted': '', + } + debug_info = '数据获取成功!' + log_info += web_info + debug_info + dic['log_info'] = log_info + except Exception as e: + debug_info = '数据生成出错: %s' % str(e) + log_info += web_info + debug_info + raise Exception(debug_info) + + except Exception as e: + # print(traceback.format_exc()) + debug_info = str(e) + dic = { + 'title': '', + 'cover': '', + 'website': '', + 'log_info': log_info, + 'error_info': debug_info, + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + } + dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}} + js = json.dumps( + dic, + ensure_ascii=False, + sort_keys=False, + indent=4, + separators=(',', ': '), + ) + return js + + +if __name__ == '__main__': + # yapf: disable + # print(main('SSN010')) + # print(main('國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露', file_path='國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露')) + # print(main('國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜', file_path='國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜')) + # print(main('韓國高端攝影頂 Yeha 私拍福利', file_path='韓國高端攝影頂 Yeha 私拍福利')) + print(main('EMTC-005', file_path='國產AV 愛神傳媒 EMTC005 怒操高冷社長秘書 米歐')) diff --git a/src/models/crawlers/guochan.py b/src/models/crawlers/guochan.py index c1c08bd..da3d20e 100644 --- a/src/models/crawlers/guochan.py +++ b/src/models/crawlers/guochan.py @@ -7,6 +7,8 @@ import urllib3 import zhconv +from models.base.number import remove_escape_string + urllib3.disable_warnings() # yapf: disable @@ -15,219 +17,119 @@ def get_lable_list(): - return ['传媒', '国产短视频', '国产精品', '国产AV', 'PsychoPorn色控', '叮叮映画', '涩会', '蜜桃影像传媒', - '大番号番啪啪', 'REAL野性派', '豚豚创媒', '宫美娱乐', '肉肉传媒', '爱妃传媒', '天美传媒', '皇家华人', - '91制片厂', '果冻传媒', 'O-STAR', '兔子先生', '杏吧原创', '杏吧独家', '辣椒原创', '麻豆传媒', '糖心', - '麻豆传媒映画', '红斯灯影像', '绝对领域', '麻麻传媒', '渡边传媒', 'AV帝王', '桃花源', '蝌蚪传媒', 'SWAG', - '麻豆', '杏吧'] + return ["麻豆传媒", "91茄子", "Ed Mosaic", "HongKongDoll", "JVID", "MINI传媒", "SA国际传媒", "TWAV", "乌鸦传媒", + "乐播传媒", "优蜜传媒", "偶蜜国际", "叮叮映画", "哔哩传媒", "大象传媒", "天美传媒", "开心鬼传媒", "微密圈", + "扣扣传媒", "抖阴传媒", "星空无限传媒", "映秀传媒", "杏吧传媒", "果冻传媒", "模密传媒", "爱污传媒", + "爱神传媒", + "爱豆传媒", "狂点映像", "猛料原创", "猫爪影像", "皇家华人", "精东影业", "糖心VLOG", "维秘传媒", "草莓视频", + "萝莉社", + "蜜桃传媒", "西瓜影视", "起点传媒", "香蕉视频", "PsychoPorn色控", "蜜桃影像传媒", "大番号番啪啪", + "REAL野性派", "豚豚创媒", + "宫美娱乐", "肉肉传媒", "爱妃传媒", "91制片厂", "O-STAR", "兔子先生", "杏吧原创", "杏吧独家", "辣椒原创", + "麻豆传媒映画", "红斯灯影像", + "绝对领域", "麻麻传媒", "渡边传媒", "AV帝王", "桃花源", "蝌蚪传媒", "SWAG", "麻豆", "杏吧", "糖心", + "国产短视频", "国产精品", "国产AV", "涩会"] def 
get_actor_list(): - return [ - '苏妲己', - '苏畅', - '宁洋子', - '沈芯语', - '艾秋', - '吴梦梦', - '尹媚', - '张芸熙', - '夏晴子', - '白佳萱', - '林思妤', - '沈娜娜', - '仙儿媛', - '许安妮', - '刘语珊', - '刘思慧', - '叶一涵', - '林亦涵', - '雪千夏', - '欧美玲', - '赵佳美', - '李慕儿', - '徐韵珊', - '苏娅', - '糖果屋', - '王茜', - '李婧琪', - '夏滢', - '顾伊梦', - '杜冰若', - '赵颖儿', - '秦可欣', - '莫安安', - '安娜', - '黎星若', - '仙儿', - '林予曦', - '蒋佑怡', - '许书曼', - '白晶晶', - '王有容', - '琳希', - '李恩琦', - '赵美凤', - '王欣', - '徐筱欣', - '黄雅曼', - '伊靖瑶', - '菲于娜', - '罗瑾萱', - '金丞熙', - '李文雯', - '苏清歌', - '付妙菱', - '钟丽琪', - '张娅庭', - '蜜苏', - '凌薇', - '叶凡舒', - '董小宛', - '程雨沫', - '瑶贝', - '郭瑶瑶', - '李嘉欣', - '辰悦', - '李曼妮', - '洛雪', - '千鹤', - '袁庭妮', - '林思好', - '张云熙', - '杜鹃', - '玛丽莲', - '李茹', - '何苗', - '黄雪纯', - '田恬', - '李琼', - '聂小倩', - '张晴', - '丁香', - '林凤娇', - '刘颖儿', - '杨思敏', - '李忆彤', - '伊蒂丝', - '绿帽先生', - '戚小怜', - '杨柳', - '唐茜', - '苏艾文', - '张曼青', - '斑斑', - '孟若羽', - '陈圆圆', - '雷梦娜', - '氖氖', - '仙儿', - '艾玛', - '蔚曼', - '静静', - '艾瑞卡', - '娜米', - '莉娜', - '乔安', - '林子涵', - '萱萱', - '糖糖', - '徐婕', - '王欣', - '白颖', - '吴芮瑜', - '韩棠', - '季妍希', - '沙耶香', - '七七', - '莉娜乔安', - '美雪樱', - '柚木结爱', - '黑田奈奈', - '王亦舒', - '张雅婷', - '李文静', - '肖泳', - '韩小雅', - '神山奈奈', - '白川麻衣', - '茜茜', - '夜夜', - '高梨遥香', - 'HongKongDoll', - '玩偶姐姐', - '蘇妲己', - '蘇暢', - '寧洋子', - '沈芯語', - '吳夢夢', - '張芸熙', - '仙兒媛', - '許安妮', - '劉語珊', - '劉思慧', - '葉一涵', - '歐美玲', - '趙佳美', - '李慕兒', - '徐韻珊', - '蘇婭', - '夏瀅', - '顧伊夢', - '趙穎兒', - '仙兒', - '蔣佑怡', - '許書曼', - '趙美鳳', - '黃雅曼', - '伊靖瑤', - '羅瑾萱', - '蘇清歌', - '鍾麗琪', - '張婭庭', - '蜜蘇', - '葉凡舒', - '瑤貝', - '郭瑤瑤', - '辰悅', - '千鶴', - '張雲熙', - '杜鵑', - '瑪麗蓮', - '黃雪純', - '李瓊', - '聶小倩', - '張晴', - '林鳳嬌', - '劉穎兒', - '楊思敏', - '李憶彤', - '伊蒂絲', - '綠帽先生', - '戚小憐', - '楊柳', - '蘇艾文', - '張曼青', - '陳圓圓', - '雷夢娜', - '仙兒', - '艾瑪', - '靜靜', - '喬安', - '白穎', - '吳芮瑜', - '韓棠', - '莉娜喬安', - '美雪櫻', - '柚木結愛', - '張雅婷', - '李文靜', - '韓小雅', - '高梨遙香', - ] + return ['Madison Summers', 'Spencer Bradley', 'Madison Morgan', 'Rosalyn Sphinx', 'Braylin Bailey', + 'Whitney Wright', 'Victoria Voxxx', 'Alexia Anders', + 'Bella Rolland', 'Violet Myers', 'Sophia Leone', 'Violet Starr', 'Eliza Ibarra', 'HongKongDoll', + 'Keira Croft', 'April Olsen', 'Avery Black', + 'Amber Moore', 'Anny Aurora', 'Skylar Snow', 'Harley Haze', 'Paige Owens', 'Vanessa Sky', 'MasukuChan', + 'Kate Bloom', 'Kimmy Kimm', 'Ana Foxxx', + 'Lexi Luna', 'Gia Derza', 'Skye Blue', 'Nico Love', 'Alyx Star', 'Ryan Reid', 'Kira Noir', 'Karma Rx', + '下面有根棒棒糖', 'Vivian姐', 'COLA酱', + 'cola醬', 'Stacy', 'ROXIE', '真木今日子', '小七软同学', 'Chloe', 'Alona', '小日向可怜', 'NANA', '玩偶姐姐', + '粉色情人', '桥本香菜', '冉冉学姐', '小二先生', + '饼干姐姐', 'Rona', '不见星空', '米娜学姐', '阿蛇姐姐', '樱花小猫', '樱井美里', '宸荨樱桃', '樱空桃桃', + '牛奶泡芙', '91兔兔', '棉花糖糖', '桥本爱菜', + '许木学长', 'MOMO', '驯鹿女孩', '高梨遥香', 'DORY', '冬月结衣', 'Aida', '香菜公主', '藤田美绪', '浅尾美羽', + '天音美纱', '中条爱莉', '三月樱花', 'Emma', + 'Vita', '千夜喵喵', '水原圣子', '白川麻衣', '池田奈美', '西村莉娜', 'A天使爱', '中野惠子', '麻衣CC', + '樱桃空空', 'LENA', '小泽纱香', '木下日葵', '中岛芳子', + '弥生美月', '逢见梨花', '宇佐爱花', '沙月芽衣', '羽月萌音', '前田由美', '伊东爱瑠', 'Misa', '绿帽先生', + '莉娜乔安', '柚木结爱', '黑田奈奈', '神山奈奈', + '孟若羽', '夏晴子', '吴梦梦', '沈娜娜', '李蓉蓉', '林思妤', '仙儿媛', '金宝娜', '季妍希', '温芮欣', + '吴文淇', '苏语棠', '秦可欣', '吴芳宜', '李娜娜', + '乐奈子', '宋南伊', '小水水', '白允儿', '管明美', '雪千夏', '苏清歌', '玥可岚', '梁芸菲', '白熙雨', + '小敏儿', '楚梦舒', '柚子猫', '姚宛儿', '宋雨川', + '舒可芯', '苏念瑾', '白沛瑶', '林沁儿', '唐雨菲', '李允熙', '张芸熙', '寻小小', '白靖寒', '钟宛冰', + '李薇薇', '米菲兔', '雷梦娜', '董悦悦', '袁子仪', + '赖畇希', '王以欣', '夏禹熙', '狐不妖', '凌波丽', '黎芷萱', '陆斑比', '辛尤里', '小猫咪', '顾桃桃', + '南芊允', '岚湘庭', '林芊彤', '梁佳芯', '林凤娇', + '明日香', '绫波丽', '邓紫晴', '赵一曼', '吴茜茜', '锅锅酱', '倪哇哇', '潘雨曦', '吴恺彤', '美杜莎', + '郭童童', '陈可心', '莫夕慈', '沈芯语', '董小宛', + '苏艾文', '翁雨澄', '赵晓涵', '小桃酱', '宋东琳', '香月怜', '李文雯', 
'白若冰', '徐夜夜', '真希波', + '爱丽丝', '张宇芯', '金善雅', '李依依', '苏安亚', + '奶咪酱', '白葵司', '罗瑾萱', '宁洋子', '小夜夜', '白晶晶', '张雅婷', '吴心语', '林曼芸', '项子甯', + '吳芳宜', '苏小小', '文冰冰', '韩宝儿', '白星雨', + '林怡梦', '张欣妍', '七濑恋', '白思吟', '吴凯彤', '溫芮欣', '林可菲', '黎芷媗', '御梦子', '苏雨彤', + '古伊娜', '聂小倩', '陈圆圆', '沙美辰', '林妙可', + '乐淆雪', '李恩娜', '周晴晴', '杨思敏', '李曼妮', '戚小怜', '谢语彤', '王筱璐', '卢珊珊', '程诗诗', + '林玥玥', '白瞳瞳', '魏乔安', '米胡桃', '施子涵', + '北野爱', '杜冰若', '玛丽莲', '胡蓉蓉', '万静雪', '花语柔', '萧悦儿', '林晓雪', '兰心洁', '神谷怜', + '唐雨霏', '鸡蛋饼', '沈湘妮', '费爵娜', '小美惠', + '大奶露', '向若云', '苏小沫', '榨汁姬', '陈星然', '夏雨荷', '姚彤彤', '莫云雪', '艾瑞卡', '黄雪纯', + '赵雅琳', '叶宸欣', '伊琬琳', '陈美惠', '金巧巧', + '陈美琳', '陆思涵', '顾小北', '陈小雨', '维里娜', '兔小白', '叶子红', '美凉子', '李丹彤', '李微微', + '白婷婷', '艾米酱', '刘小姗', '白童童', '张琪琪', + 'Yua', '小玩子', '岚可彤', '都可可', '李慕儿', '叶一涵', '赵佳美', '董小婉', '钟丽琪', '韩小雅', '杨朵儿', + '叶梦语', '程雨沫', '张曼青', '纪妍希', '伊婉琳', + '凌雨萱', '潘甜甜', '美竹玲', '韩依人', '奈奈子', '林雪漫', '宋甜甜', '陆雪琪', '宋妮可', '陆子欣', + '范可可', '许依然', '苏小新', '蒋梦琳', '李可欣', + '小鹿酱', '小林杏', '陶杏儿', '明步奈', '苏宁儿', '白潼潼', '增田枫', '特污兔', '何安汝', '倪菀儿', + '唐可可', '口罩酱', '小千绪', '糖糖儿', '许安妮', + '李婧琪', '刘思慧', '欧阳晶', '欧美玲', '林亦涵', '钟以彤', '许书曼', '付妙菱', '伊靖瑶', '张娅庭', + '韩小野', '宫泽蓝', '冯思雨', '林小樱', '刘颖儿', + '莫潇潇', '胡心瑶', '林雨露', '苏婧薇', '许月珍', '陈若瑶', '吴芮瑜', '叶如梦', '刘依依', '吴语菲', + '张妮妮', '林子涵', '张子瑜', '周卿卿', '李师师', + '苏语堂', '方紫璐', '袁采菱', '刘清韵', '李曼丽', '刘小雯', '姬咲华', '高小颜', '蔡晓雨', '梁如意', + '林语桐', '王小妮', '唐月琴', '星谷瞳', '何小丽', + '张婉妍', '酒井爱', '张秀玲', '晚晚酱', '薛梦琪', '李乐乐', '张佳晨', '程媛媛', '沐语柔', '安琪拉', + '韩倪希', '苏妲己', '白佳萱', '刘语珊', '徐韵珊', + '糖果屋', '顾伊梦', '赵颖儿', '莫安安', '黎星若', '林予曦', '蒋佑怡', '王有容', '李恩琦', '赵美凤', + '徐筱欣', '黄雅曼', '菲于娜', '金丞熙', '叶凡舒', + '郭瑶瑶', '李嘉欣', '袁庭妮', '林思好', '张云熙', '李忆彤', '伊蒂丝', '沙耶香', '美雪樱', '王亦舒', + '李文静', '鸡教练', '斑斑', '坏坏', '糖糖', '艾秋', + '凌薇', '莉娜', '韩棠', '苡若', '尤莉', '优娜', '林嫣', '徐蕾', '周甯', '唐茜', '香菱', '佳芯', '湘湘', + '米欧', '斑比', '蜜苏', '小婕', '艾熙', '娃娃', + '艾玛', '雪霏', '夜夜', '欣欣', '乔安', '羽芮', '美酱', '师师', '玖玖', '橙子', '晨曦', '苏娅', '黎儿', + '晨晨', '嘉洛', '小遥', '苏畅', '琪琪', '苡琍', + '李慕', '心萱', '昀希', '黎娜', '乐乐', '樱桃', '桐桐', '苏璇', '安娜', '悠悠', '茉莉', '宛冰', '静静', + '丝丝', '菲菲', '樱樱', '波妮', '唐芯', '小野', + '何苗', '甜心', '瑶瑶', '小捷', '薇薇', '美樱', '宁静', '欧妮', '吉吉', '小桃', '绯丽', '嘉琪', '咪妮', + '雯茜', '小洁', '李琼', '唐霏', '岚玥', '熙熙', + '米娅', '舒舒', '斯斯', '欣怡', '妍儿', '阿雅', '宋可', '畇希', '柔伊', '雅沁', '惠敏', '露露', '艾悠', + '娜娜', '李娜', '肖云', '王玥', '林洋', '清洛', + '艾鲤', '依涵', '半雪', '琦琦', '莎莎', '小冉', '琳怡', '莉奈', '梅子', '啤儿', '瑶贝', '杨柳', '童汐', + '米亚', '琳达', '晴天', 'KK', '紫宸', '淑怡', + '花花', '金铭', '程葳', '妍希', '咪妃', '茜茜', '小蜜', '凌萱', '觅嫣', '涵涵', '欲梦', '美琳', '杜鹃', + '许诺', '兮兮', '白鹿', '虞姬', '丽萨', '蔷薇', + '小影', '优优', '茶茶', '可儿', '甜甜', '憨憨', '波尼', '依颂', '依依', '思思', '芳情', '月牙', '小爱', + '淳儿', '苗方', '茶理', '苹果', '苏然', '陶子', + '董欣', '羽熙', '清沐', '林襄', '娃诺', '洁咪', '小昭', '球球', '紫萱', '南兰', '安琪', '可乐', '夏露', + '诗琪', '陈韵', '丽娜', '苏旋', '月月', '石榴', + '米兰', '恩恩', '西子', '芷萱', '酥酥', '王茜', '千鹤', '雪见', '姜洁', '张晴', '辰悦', '丁香', '白颖', + '穆娜', '小芳', '吉娜', '秋霞', '无双', '夏宝', + '舒涵', '小柔', '小小', '璇元', '米砂', '余丽', '美嘉', '莉莉', '奈奈', '黑糖', '晴子', '多乙', '徐婕', + '闵闵', '小雪', '洋洋', '明儿', '苏茜', '芯怡', + '姚茜', '百合', '婉婷', '小乔', '芽芽', '婕珍', '乔乔', '紫寒', '小薇', '菜菜', '洁米', '夏天', '灵枝', + '语伊', '徐艳', '王佩', '希汶', '雅捷', '喵喵', + '尤奈', '仙儿', '氖氖', '蔚曼', '田恬', '颂潮', '小婵', '千凌', '李燕', '林芳', '杨桃', '艾莉', '落落', + '冯雪', '王蓉', '妖妖', '雨晨', '心雪', '穆雪', + '韩焉', '邱月', '檀雅', '柯柯', '七七', '鱼儿', '丹丹', '简一', '淑仪', '小哇', '朵儿', '妲己', '云朵', + '唐菲', '邦妮', '白英', '夏夏', '安安', '小艺', + '丽丽', '敏敏', '空空', '椿芽', '小言', '李蕊', '水水', '小鱼', '艾艾', '尹媚', '夏滢', '琳希', '王欣', + '洛雪', '李茹', '娜米', '萱萱', '肖泳'] def get_number_list(number, appoint_number='', file_path=''): # 处理国产番号 + + # 
国产匹配番号或标题前也可以先排除路径中多余字符 + if file_path: + file_path = remove_escape_string(file_path) + file_name = os.path.splitext(os.path.split(file_path)[1])[0].upper() if file_path else '' number = number.upper() number_list = [] # 返回一个番号列表,用来搜索 @@ -357,7 +259,10 @@ def get_number_list(number, appoint_number='', file_path=''): # 处理国产番 number_list.extend([number_normal_4, number_has_nothing_4, number_has_space_4]) if len(number_list): break + # 番号识别将纯数字和字母放在最前面(将长度最短的放前面即可),刮削网站一般也只取 number_list 第一项进行搜索,其他用于搜索结果页比对 + sorted_number_list = sorted(number_list, key=lambda x: len(x)) + # 以下处理没有番号的作品 # 台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品 # PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画 # 國産麻豆AV 麻豆番外 大番號女優空降上海 特別篇 沈芯語 @@ -403,17 +308,64 @@ def get_number_list(number, appoint_number='', file_path=''): # 处理国产番 # 把文件名加到列表 filename_list.append(real_file_name) + # 演员后面的第一句成功刮削概率较高,插入列表第一项 + # 超级丝袜控180大长腿女神▌苹果▌我的室友 第八篇 黑丝女仆骚丁小穴湿淋淋 肉棒塞满激怼爆射 + # 17205-最新极品天花板小萝莉▌粉色情人▌摄影师的威胁 粗屌爆艹少女白虎嫩鲍 极速刮擦蜜壶淫靡下体 + # 潮喷淫娃御姐〖小水水〗和异地大奶女友开房,激情互舔口爆高潮喷水,黑丝美腿女神极度淫骚 潮喷不停 + # 极品爆乳鲜嫩美穴貌美尤物▌苏美奈▌家政女仆的肉体服务 肏到羞耻喷汁 极射中出鲜嫩美穴 + # 【小酒改头换面】,罕见大胸嫩妹,小伙今夜捡到宝了 + if u := re.search(r'(【.+】|▌.+▌|〖.+〗|『.+』)[,,\- ]?(\S{6,18}?)[,,\- ]', real_file_name): + search_char = u.group(2) + filename_list.insert(0, search_char) + # 转繁体 filename_list.append(zhconv.convert(filename_list[0], 'zh-hant')) # 去重去空 new_number_list = [] new_filename_list = [] - [new_number_list.append(i) for i in number_list if i and i not in new_number_list] + [new_number_list.append(i) for i in sorted_number_list if i and i not in new_number_list] [new_filename_list.append(i) for i in filename_list if i and i not in new_filename_list] return new_number_list, new_filename_list +def get_extra_info(title, file_path, info_type, tag='', actor='', series=''): + all_info = title + file_path + tag + actor + series + + # 未找到标签时,从各种信息里匹配,忽略大小写 + if info_type == "tag": + tag_list = [] + all_tag = get_lable_list() + for each in all_tag: + if re.search(f'{each}', all_info, re.IGNORECASE): + tag_list.append(each) + new_tag_list = [] + [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] + return ','.join(new_tag_list) + + # 未找到演员时,看热门演员是否在标题和各种信息里,人名完全匹配 + if info_type == "actor": + actor_list = [] + all_actor = get_actor_list() + for each in all_actor: + if re.search(fr'\b{each}\b', all_info, re.IGNORECASE): + actor_list.append(each) + new_actor_list = [] + [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] + return ','.join(new_actor_list) + + # 未找到系列时,从各种信息里匹配,没有相关数据,预留逻辑 + if info_type == "series": + series_list = [] + all_series = get_lable_list() + for each in all_series: + if each in all_info.upper(): + series_list.append(each) + new_series_list = [] + [new_series_list.append(i) for i in series_list if i and i not in new_series_list] + return ','.join(new_series_list) + + if __name__ == '__main__': # yapf: disable # get_number_list('Md0165-4') diff --git a/src/models/crawlers/hscangku.py b/src/models/crawlers/hscangku.py new file mode 100644 index 0000000..ba6dcad --- /dev/null +++ b/src/models/crawlers/hscangku.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import json +import re +import time + +import urllib3 +from lxml import etree + +from models.base.web import curl_html +from models.config.config import config +from models.crawlers.guochan import get_extra_info, get_number_list + +urllib3.disable_warnings() # yapf: disable + + +# import traceback + +def get_actor_photo(actor): + actor = actor.split(',') + data = {} + for i in actor: + 
actor_photo = {i: ''}
+        data.update(actor_photo)
+    return data
+
+
+def get_detail_info(html, real_url, number, file_path):
+    href = re.split(r'[/.]', real_url)[-2]
+    title_h1 = html.xpath(
+        '//h3[@class="title" and not(contains(normalize-space(.), "目录")) and not(contains(normalize-space(.), "为你推荐"))]/text()')
+    title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number
+    actor = get_extra_info(title, file_path, info_type="actor")
+    tag = get_extra_info(title, file_path, info_type="tag")
+    cover_url = html.xpath(f'//a[@data-original and contains(@href,"{href}")]/@data-original')
+    cover_url = cover_url[0] if cover_url else ''
+
+    return number, title, actor, cover_url, tag
+
+
+def get_real_url(html, number_list, hscangku_url):
+    item_list = html.xpath('//a[@class="stui-vodlist__thumb lazyload"]')
+    for each in item_list:
+        # href="/vodplay/41998-1-1.html"
+        detail_url = hscangku_url + each.get('href')
+        title_list = each.xpath('@title')
+        title = title_list[0] if title_list else ''
+        if title and detail_url:
+            for n in number_list:
+                temp_n = re.sub(r'[\W_]', '', n).upper()
+                temp_title = re.sub(r'[\W_]', '', title).upper()
+                if temp_n in temp_title:
+                    return True, n, title, detail_url
+    return False, '', '', ''
+
+
+def get_redirected_url(url):
+    result, response = curl_html(url)
+    if not result:
+        return None
+    # 匹配失败时直接返回 None,避免对 None 调用 .group() 抛出 AttributeError
+    match = re.search(r'"(https?://.*?)"', response)
+    if not match:
+        return None
+    redirected_url = match.group(1)
+    http = urllib3.PoolManager()
+    response = http.request('GET', f'{redirected_url}{url}&p=', redirect=False)
+    final_url = response.get_redirect_location()
+    return final_url if final_url else None
+
+
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''):
+    start_time = time.time()
+    website_name = 'hscangku'
+    req_web += '-> %s' % website_name
+    title = ''
+    cover_url = ''
+    web_info = '\n       '
+    log_info += ' \n    🌐 hscangku'
+    debug_info = ''
+    real_url = appoint_url
+    hscangku_url = getattr(config, 'hscangku_website', 'http://hsck.net')
+
+    try:
+        if not real_url:
+            # 处理番号
+            number_list, filename_list = get_number_list(number, appoint_number, file_path)
+            n_list = number_list[:1] + filename_list
+            # 处理重定向
+            hscangku_url = get_redirected_url(hscangku_url)
+            if not hscangku_url:
+                debug_info = '没有正确的 hscangku_url,无法刮削'
+                log_info += web_info + debug_info
+                raise Exception(debug_info)
+            for each in n_list:
+                real_url = f'{hscangku_url}/vodsearch/-------------.html?wd={each}&submit='
+                # real_url = 'http://hsck860.cc/vodsearch/-------------.html?wd=%E6%9F%9A%E5%AD%90%E7%8C%AB&submit='
+                debug_info = f'请求地址: {real_url} '
+                log_info += web_info + debug_info
+                result, response = curl_html(real_url)
+
+                if not result:
+                    debug_info = '网络请求错误: %s' % response
+                    log_info += web_info + debug_info
+                    raise Exception(debug_info)
+                search_page = etree.fromstring(response, etree.HTMLParser())
+                result, number, title, real_url = get_real_url(search_page, n_list, hscangku_url)
+                # real_url = 'http://hsck860.cc/vodsearch/-------------.html?wd=%E6%9F%9A%E5%AD%90%E7%8C%AB&submit='
+                if result:
+                    break
+            else:
+                debug_info = '没有匹配的搜索结果'
+                log_info += web_info + debug_info
+                raise Exception(debug_info)
+
+        debug_info = f'番号地址: {real_url} '
+        log_info += web_info + debug_info
+        result, response = curl_html(real_url)
+
+        if not result:
+            debug_info = '没有找到数据 %s ' % response
+            log_info += web_info + debug_info
+            raise Exception(debug_info)
+
+        detail_page = etree.fromstring(response, etree.HTMLParser())
+        number, title, actor, cover_url, tag = get_detail_info(detail_page, real_url, 
number, file_path) + actor_photo = get_actor_photo(actor) + + try: + dic = { + 'number': number, + 'title': title, + 'originaltitle': title, + 'actor': actor, + 'outline': '', + 'originalplot': '', + 'tag': tag, + 'release': '', + 'year': '', + 'runtime': '', + 'score': '', + 'series': '', + 'country': 'CN', + 'director': '', + 'studio': '', + 'publisher': '', + 'source': 'hscangku', + 'website': real_url, + 'actor_photo': actor_photo, + 'cover': cover_url, + 'poster': '', + 'extrafanart': '', + 'trailer': '', + 'image_download': False, + 'image_cut': 'no', + 'log_info': log_info, + 'error_info': '', + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + 'mosaic': '国产', + 'wanted': '', + } + debug_info = '数据获取成功!' + log_info += web_info + debug_info + dic['log_info'] = log_info + except Exception as e: + debug_info = '数据生成出错: %s' % str(e) + log_info += web_info + debug_info + raise Exception(debug_info) + + except Exception as e: + # print(traceback.format_exc()) + debug_info = str(e) + dic = { + 'title': '', + 'cover': '', + 'website': '', + 'log_info': log_info, + 'error_info': debug_info, + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + } + dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}} + js = json.dumps( + dic, + ensure_ascii=False, + sort_keys=False, + indent=4, + separators=(',', ': '), + ) + return js + + +if __name__ == '__main__': + # yapf: disable + # print(main('大像传媒之淫蕩刺青女學徒', file_path='大像传媒之淫蕩刺青女學徒')) + # print(main('冠希传媒GX-017强上弟弟的巨乳姐姐', file_path='冠希传媒GX-017强上弟弟的巨乳姐姐')) + # print(main('[SWAG]XHX-0014宅男的公仔幻化成人', file_path='[SWAG]XHX-0014宅男的公仔幻化成人')) + # print(main('IDG5401')) + print(main('大像传媒之長腿癡女代表情慾作-米歐', file_path='大像传媒之長腿癡女代表情慾作-米歐')) diff --git a/src/models/crawlers/jav321.py b/src/models/crawlers/jav321.py index 1400f35..098a212 100644 --- a/src/models/crawlers/jav321.py +++ b/src/models/crawlers/jav321.py @@ -168,10 +168,14 @@ def main(number, appoint_url='', log_info='', req_web='', language='jp'): series = getSeries(detail_page) extrafanart = getExtraFanart(detail_page) # 判断无码 - uncensorted_list = ['一本道', 'HEYZO', 'サムライポルノ', 'キャットウォーク', 'サイクロン', 'ルチャリブレ', 'スーパーモデルメディア', 'スタジオテリヤキ', - 'レッドホットコレクション', 'スカイハイエンターテインメント', '小天狗', 'オリエンタルドリーム', 'Climax Zipang', 'CATCHEYE', - 'ファイブスター', 'アジアンアイズ', 'ゴリラ', 'ラフォーレ ガール', 'MIKADO', 'ムゲンエンターテインメント', 'ツバキハウス', 'ザーメン二郎', - 'トラトラトラ', 'メルシーボークー', '神風', 'Queen 8', 'SASUKE', 'ファンタドリーム', 'マツエンターテインメント', 'ピンクパンチャー', + uncensorted_list = ['一本道', 'HEYZO', 'サムライポルノ', 'キャットウォーク', 'サイクロン', 'ルチャリブレ', + 'スーパーモデルメディア', 'スタジオテリヤキ', + 'レッドホットコレクション', 'スカイハイエンターテインメント', '小天狗', + 'オリエンタルドリーム', 'Climax Zipang', 'CATCHEYE', + 'ファイブスター', 'アジアンアイズ', 'ゴリラ', 'ラフォーレ ガール', 'MIKADO', + 'ムゲンエンターテインメント', 'ツバキハウス', 'ザーメン二郎', + 'トラトラトラ', 'メルシーボークー', '神風', 'Queen 8', 'SASUKE', 'ファンタドリーム', + 'マツエンターテインメント', 'ピンクパンチャー', 'ワンピース', 'ゴールデンドラゴン', 'Tokyo Hot', 'Caribbean'] for each in uncensorted_list: if each == studio: diff --git a/src/models/crawlers/javbus.py b/src/models/crawlers/javbus.py index 7eefd47..90062a4 100644 --- a/src/models/crawlers/javbus.py +++ b/src/models/crawlers/javbus.py @@ -174,7 +174,7 @@ def getTag(html): # 获取标签 return result -def get_real_url(number, url_type, javbus_url, json_log, headers, cookie): # 获取详情页链接 +def get_real_url(number, url_type, javbus_url, json_log, headers, cookie): # 获取详情页链接 if url_type == 'us': # 欧美 url_search = 'https://www.javbus.hair/search/' + number elif url_type == 'censored': # 有码 diff --git a/src/models/crawlers/madouqu.py 
b/src/models/crawlers/madouqu.py index 33c2b92..4f4bca5 100644 --- a/src/models/crawlers/madouqu.py +++ b/src/models/crawlers/madouqu.py @@ -8,14 +8,14 @@ from lxml import etree from models.base.web import curl_html -from models.crawlers.guochan import get_number_list +from models.config.config import config +from models.crawlers.guochan import get_extra_info, get_number_list urllib3.disable_warnings() # yapf: disable # import traceback - def get_actor_photo(actor): actor = actor.split(',') data = {} @@ -25,7 +25,7 @@ def get_actor_photo(actor): return data -def get_detail_info(html, number): +def get_detail_info(html, number, file_path): detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//p//text()') # detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//text()') title_h1 = html.xpath('//div[@class="cao_entry_header"]/header/h1/text()') @@ -48,6 +48,7 @@ def get_detail_info(html, number): cover_url = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p/img/@src') cover_url = cover_url[0] if cover_url else '' # print(number, title, actor, cover_url, studio, detail_info) + actor = get_extra_info(title, file_path, info_type="actor") if actor == '' else actor return number, title, actor, cover_url, studio @@ -55,7 +56,8 @@ def get_real_url(html, number_list): item_list = html.xpath('//div[@class="entry-media"]/div/a') for each in item_list: detail_url = each.get('href') - title = each.xpath('img[@class="lazyload"]/@alt')[0] + # lazyload属性容易改变,去掉也能拿到结果 + title = each.xpath('img[@class]/@alt')[0] if title and detail_url: for n in number_list: temp_n = re.sub(r'[\W_]', '', n).upper() @@ -75,6 +77,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file log_info += ' \n 🌐 madouqu' debug_info = '' real_url = appoint_url + madouqu_url = getattr(config, 'madouqu_website', False) try: if not real_url: @@ -82,7 +85,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file number_list, filename_list = get_number_list(number, appoint_number, file_path) n_list = number_list[:1] + filename_list for each in n_list: - real_url = f'https://madouqu.com/?s={each}' + real_url = f'{madouqu_url}/?s={each}' if madouqu_url else f'https://madouqu.com/?s={each}' # real_url = 'https://madouqu.com/?s=XSJ-138.%E5%85%BB%E5%AD%90%E7%9A%84%E7%A7%98%E5%AF%86%E6%95%99%E5%AD%A6EP6' debug_info = f'请求地址: {real_url} ' log_info += web_info + debug_info @@ -111,7 +114,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file raise Exception(debug_info) detail_page = etree.fromstring(response, etree.HTMLParser()) - number, title, actor, cover_url, studio = get_detail_info(detail_page, number) + number, title, actor, cover_url, studio = get_detail_info(detail_page, number, file_path) actor_photo = get_actor_photo(actor) try: diff --git a/src/models/crawlers/mdtv.py b/src/models/crawlers/mdtv.py index 94efea8..511b861 100644 --- a/src/models/crawlers/mdtv.py +++ b/src/models/crawlers/mdtv.py @@ -40,6 +40,8 @@ def get_some_info(html, title, file_path): # 未找到演员时,看热门演员是否在标题和各种信息里 series = series_list[0] if series_list else '' tag = ','.join(tag_list) + actor_fake_name = any('未知' in item for item in actor_list) + actor_list = [] if actor_fake_name else actor_list if not actor_list: all_info = title + series + tag + file_path all_actor = get_actor_list() diff --git a/src/models/crawlers/mmtv.py b/src/models/crawlers/mmtv.py index 0e7e143..ed565c0 100644 --- 
a/src/models/crawlers/mmtv.py +++ b/src/models/crawlers/mmtv.py @@ -9,6 +9,8 @@ from models.base.number import is_uncensored from models.base.web import curl_html +from models.config.config import config +from models.crawlers.guochan import get_extra_info urllib3.disable_warnings() # yapf: disable @@ -30,13 +32,15 @@ def get_title(html, web_number): return result[0].replace(web_number, '').strip() if result else '' -def get_actor(html): +def get_actor(html, title, file_path): actor_list = html.xpath('//div[@class="fullvideo-idol"]/span/a/text()') actor = '' if actor_list: for each in actor_list: '''愛澄玲花,日高ゆりあ(青山ひより) 菜津子 32歳 デザイナー''' actor += re.sub(r'(.+)', '', each).split(' ')[0] + ',' + else: + actor = get_extra_info(title, file_path, info_type="actor") return actor.strip(',') @@ -134,7 +138,12 @@ def get_tag(html): def get_extrafanart(html): - result = html.xpath('//a[@class="screens-item fresco"]/@href') + # 前几张 + result1 = html.xpath('//span/img[contains(@class, "lazyload")]/@data-src') + # 其他隐藏需点击的 + if result2 := html.xpath('//div[contains(@class, "fullvideo")]/script[@language="javascript"]/text()'): + result2 = re.findall(r'https?://.+?\.jpe?g', str(result2)) + result = result1 + result2 return result if result else '' @@ -166,7 +175,7 @@ def get_number(html, number): return number.replace('FC2-PPV ', 'FC2-'), release, runtime, number -def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): +def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path=''): start_time = time.time() website_name = '7mmtv' req_web += '-> %s' % website_name @@ -175,9 +184,13 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): web_info = '\n ' log_info += ' \n 🌐 7mmtv' debug_info = '' + mmtv_url = 'https://www.7mmtv.sx' + if hasattr(config, '7mmtv_website'): + mmtv_url = getattr(config, '7mmtv_website') real_url = appoint_url # search_url = "https://bb9711.com/zh/searchform_search/all/index.html" - search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html" + # search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html" + search_url = f"{mmtv_url}/zh/searchform_search/all/index.html" mosaic = '' try: @@ -186,7 +199,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): if number.upper().startswith('FC2'): search_keyword = re.findall(r'\d{3,}', number)[0] - search_url = f'https://7mmtv.sx/zh/searchform_search/all/index.html?search_keyword={search_keyword}&search_type=searchall&op=search' + search_url = f'{search_url}?search_keyword={search_keyword}&search_type=searchall&op=search' debug_info = f'搜索地址: {search_url} ' log_info += web_info + debug_info result, response = curl_html(search_url) @@ -220,7 +233,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): debug_info = '数据获取失败: 未获取到title!' 
log_info += web_info + debug_info raise Exception(debug_info) - actor = get_actor(html_info) + actor = get_actor(html_info, title, file_path) actor_photo = get_actor_photo(actor) cover_url = get_cover(html_content) outline, originalplot = get_outline(html_info) @@ -245,7 +258,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): 'runtime': runtime, 'score': '', 'series': '', - 'country': 'JP', + 'country': 'CN', 'director': director, 'studio': studio, 'publisher': publisher, @@ -306,7 +319,8 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): # print(main('H4610-ki230225')) # print(main('c0930-ki221218')) # print(main('c0930-hitozuma1407')) - print(main('h0930-ori1665')) + # print(main('h0930-ori1665')) + print(main('h0930-ori1665', appoint_url='https://7mm002.com/zh/amateur_content/107108/content.html')) # print(main('RBD-293')) # print(main('LUXU-728')) # 无结果 # print(main('fc2-1050737')) # 标题中有/
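
A minimal usage sketch (not part of the patch): how the new hscangku/cableav crawlers drive the shared guochan helpers added above. The signatures come from src/models/crawlers/guochan.py in this patch; the sample filename is taken from the cableav test comments; running it assumes this repo's models package is importable.

from models.crawlers.guochan import get_extra_info, get_number_list

file_path = '國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露.mp4'
# Number variants sorted shortest-first, plus title candidates from the file name
number_list, filename_list = get_number_list('MD0312', file_path=file_path)
# Both new crawlers query the site search with the first number variant, then the title candidates
keywords = number_list[:1] + filename_list
# When a detail page lists no actor, the crawlers fall back to matching known
# actor names against the title and file path
actor = get_extra_info(filename_list[0], file_path, info_type='actor')
print(keywords, actor)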