From 8d6cd7e634834a1b654a2206f57985d961c9e4d4 Mon Sep 17 00:00:00 2001 From: runoob Date: Sat, 13 Jan 2024 14:26:19 +0800 Subject: [PATCH 01/12] Fix: write custom actor names to nfo file --- src/models/core/nfo.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py index a0467e0..a272e4b 100644 --- a/src/models/core/nfo.py +++ b/src/models/core/nfo.py @@ -205,19 +205,26 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal if 'country,' in nfo_include_new: print(f" {country}", file=code) - # 输出演员 + # 输出男女演员 if 'actor_all,' in nfo_include_new: actor = all_actor - if actor and actor != '未知演员' and actor != '未知演員' and 'actor,' in nfo_include_new: + # 有演员时输出演员 + if 'actor,' in nfo_include_new and actor and actor not in ['未知演员','未知演員']: actor_list = actor.split(',') # 字符串转列表 actor_list = [actor.strip() for actor in actor_list if actor.strip()] # 去除空白 - if actor_list: - for each in actor_list: - print(" ", file=code) - print(" " + each + "", file=code) - print(" Actor", file=code) - print(" ", file=code) - + # 无演员时输出演员 以文件命名设置中未知演员设置项为演员名,默认设置和空值不写入NFO + elif not actor_list and config.actor_no_name not in ["未知演员",'未知演員','']: + actor = config.actor_no_name + actor_list = actor.split(',') # 字符串转列表 + actor_list = [actor.strip() for actor in actor_list if actor.strip()] # 去除空白 + signal.add_log(f'⛑️ 无演员名, 使用手动设置项 🛠未知演员 写入NFO {config.actor_no_name}') + if actor_list: + for each in actor_list: + print(" ", file=code) + print(" " + each + "", file=code) + print(" Actor", file=code) + print(" ", file=code) + # 输出导演 if director and 'director,' in nfo_include_new: print(" " + director + "", file=code) From fc881333cc71eef653c7c48a964ca43e757bf8ae Mon Sep 17 00:00:00 2001 From: runoob Date: Sun, 14 Jan 2024 21:11:20 +0800 Subject: [PATCH 02/12] =?UTF-8?q?Fix:=20nfo=E6=96=87=E4=BB=B6=E5=86=99?= =?UTF-8?q?=E5=85=A5=E9=94=99=E8=AF=AF=E7=A9=BA=E6=A0=BC=EF=BC=9B=E6=89=8B?= =?UTF-8?q?=E5=8A=A8=E8=AE=BE=E7=BD=AE=E6=BC=94=E5=91=98=E5=86=99=E5=85=A5?= =?UTF-8?q?nfo=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/models/core/nfo.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py index a272e4b..3084b7a 100644 --- a/src/models/core/nfo.py +++ b/src/models/core/nfo.py @@ -81,8 +81,10 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal nfo_title = config.naming_media if not number: number = title + #默认emby视频标题配置为 [number title],国产重复时需去掉一个,去重需注意空格也应一起去掉,否则国产的nfo标题中会多一个空格 + #读取nfo title信息会去掉前面的number和空格以保留title展示出来,同时number和标题一致时,去掉number的逻辑变成去掉整个标题导致读取失败,见424行 if number == title and 'number' in nfo_title and 'title' in nfo_title: - nfo_title = nfo_title.replace('originaltitle', '').replace('title', '') + nfo_title = nfo_title.replace('originaltitle', '').replace('title', '').strip() first_letter = get_number_first_letter(number) # 处理演员 @@ -204,20 +206,22 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal # 输出国家 if 'country,' in nfo_include_new: print(f" {country}", file=code) - + + #初始化 actor_list + actor_list = [] # 输出男女演员 if 'actor_all,' in nfo_include_new: actor = all_actor # 有演员时输出演员 - if 'actor,' in nfo_include_new and actor and actor not in ['未知演员','未知演員']: + if 'actor,' in nfo_include_new and actor: actor_list = actor.split(',') # 字符串转列表 actor_list = [actor.strip() for actor in 
actor_list if actor.strip()] # 去除空白 # 无演员时输出演员 以文件命名设置中未知演员设置项为演员名,默认设置和空值不写入NFO - elif not actor_list and config.actor_no_name not in ["未知演员",'未知演員','']: + elif 'actor,' in nfo_include_new and config.actor_no_name not in ["未知演员",'未知演員','']: actor = config.actor_no_name actor_list = actor.split(',') # 字符串转列表 actor_list = [actor.strip() for actor in actor_list if actor.strip()] # 去除空白 - signal.add_log(f'⛑️ 无演员名, 使用手动设置项 🛠未知演员 写入NFO {config.actor_no_name}') + signal.add_log(f'⛑️ 无演员名, 使用手动命名 写入NFO {config.actor_no_name}') if actor_list: for each in actor_list: print(" ", file=code) From e38989dce66e8b05b75b82ee04b2b7a7b63d03bb Mon Sep 17 00:00:00 2001 From: runoob Date: Sun, 14 Jan 2024 21:47:00 +0800 Subject: [PATCH 03/12] Fix: mmtv custom website;try more actor matching rules --- src/models/core/crawler.py | 2 +- src/models/crawlers/mmtv.py | 56 +++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/models/core/crawler.py b/src/models/core/crawler.py index 04a5360..76386b4 100644 --- a/src/models/core/crawler.py +++ b/src/models/core/crawler.py @@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai elif website == 'mgstage': json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number)) elif website == '7mmtv': - json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language)) + json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path)) elif website == 'fc2': json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language)) elif website == 'fc2hub': diff --git a/src/models/crawlers/mmtv.py b/src/models/crawlers/mmtv.py index 0e7e143..f5cc6b1 100644 --- a/src/models/crawlers/mmtv.py +++ b/src/models/crawlers/mmtv.py @@ -9,6 +9,8 @@ from models.base.number import is_uncensored from models.base.web import curl_html +from models.config.config import config +from models.crawlers.guochan import get_actor_list, get_lable_list urllib3.disable_warnings() # yapf: disable @@ -30,15 +32,53 @@ def get_title(html, web_number): return result[0].replace(web_number, '').strip() if result else '' -def get_actor(html): +def get_actor(html, title, file_path): actor_list = html.xpath('//div[@class="fullvideo-idol"]/span/a/text()') actor = '' if actor_list: for each in actor_list: '''愛澄玲花,日高ゆりあ(青山ひより) 菜津子 32歳 デザイナー''' actor += re.sub(r'(.+)', '', each).split(' ')[0] + ',' + else: + actor = get_some_info(title, file_path, info_type="actor") return actor.strip(',') +def get_some_info(title, file_path, info_type, tag='', actor='', series=''): + + all_info = title + file_path + tag + actor + series + + # 未找到标签时,从各种信息里匹配 + if info_type == "tag": + tag_list = [] + all_tag = get_lable_list() + for each in all_tag: + if each in all_info: + tag_list.append(each) + new_tag_list = [] + [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] + return ','.join(new_tag_list) + + # 未找到演员时,看热门演员是否在标题和各种信息里 + if info_type == "actor": + actor_list = [] + all_actor = get_actor_list() + for each in all_actor: + if each in all_info: + actor_list.append(each) + new_actor_list = [] + [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] + return ','.join(new_actor_list) + + # 未找到系列时,从各种信息里匹配 + if info_type == "series": + series_list = [] + all_series = get_lable_list() + for each in all_series: + if each in all_info: + series_list.append(each) + new_series_list = [] + 
[new_series_list.append(i) for i in series_list if i and i not in new_series_list] + return ','.join(new_series_list) def get_real_url(html, number): result = html.xpath('//figure[@class="video-preview"]/a') @@ -134,7 +174,7 @@ def get_tag(html): def get_extrafanart(html): - result = html.xpath('//a[@class="screens-item fresco"]/@href') + result = html.xpath('//a[@class="lazyload screens-item fresco"]/@href') return result if result else '' @@ -166,7 +206,7 @@ def get_number(html, number): return number.replace('FC2-PPV ', 'FC2-'), release, runtime, number -def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): +def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path=''): start_time = time.time() website_name = '7mmtv' req_web += '-> %s' % website_name @@ -175,9 +215,13 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): web_info = '\n ' log_info += ' \n 🌐 7mmtv' debug_info = '' + mmtv_url = 'https://www.7mmtv.sx' + if hasattr(config, '7mmtv_website'): + mmtv_url = getattr(config, '7mmtv_website') real_url = appoint_url # search_url = "https://bb9711.com/zh/searchform_search/all/index.html" - search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html" + # search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html" + search_url = f"{mmtv_url}/zh/searchform_search/all/index.html" mosaic = '' try: @@ -186,7 +230,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): if number.upper().startswith('FC2'): search_keyword = re.findall(r'\d{3,}', number)[0] - search_url = f'https://7mmtv.sx/zh/searchform_search/all/index.html?search_keyword={search_keyword}&search_type=searchall&op=search' + search_url = f'{search_url}?search_keyword={search_keyword}&search_type=searchall&op=search' debug_info = f'搜索地址: {search_url} ' log_info += web_info + debug_info result, response = curl_html(search_url) @@ -220,7 +264,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'): debug_info = '数据获取失败: 未获取到title!' 
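# Illustrative sketch (editorial aside, not part of the patch): the
# order-preserving de-duplication idiom used in get_some_info() above
# relies on a side-effect list comprehension. An equivalent, more
# conventional form (assuming Python 3.7+, where dict preserves
# insertion order) would be:
#
#     def dedupe_keep_order(items):
#         """Keep first occurrences, drop empty strings."""
#         return [i for i in dict.fromkeys(items) if i]
#
#     dedupe_keep_order(['麻豆', '', '麻豆', '杏吧'])  # -> ['麻豆', '杏吧']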
log_info += web_info + debug_info raise Exception(debug_info) - actor = get_actor(html_info) + actor = get_actor(html_info, title, file_path) actor_photo = get_actor_photo(actor) cover_url = get_cover(html_content) outline, originalplot = get_outline(html_info) From 192c7e0a7ed899cc29eabb085e86d4b38935a660 Mon Sep 17 00:00:00 2001 From: runoob Date: Sun, 14 Jan 2024 21:51:43 +0800 Subject: [PATCH 04/12] Fix: not match unknown actor --- src/models/crawlers/mdtv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/models/crawlers/mdtv.py b/src/models/crawlers/mdtv.py index 94efea8..d7b3647 100644 --- a/src/models/crawlers/mdtv.py +++ b/src/models/crawlers/mdtv.py @@ -40,6 +40,8 @@ def get_some_info(html, title, file_path): # 未找到演员时,看热门演员是否在标题和各种信息里 series = series_list[0] if series_list else '' tag = ','.join(tag_list) + actor_fake_name = any ('未知' in item for item in actor_list) + actor_list = [] if actor_fake_name else actor_list if not actor_list: all_info = title + series + tag + file_path all_actor = get_actor_list() From cfa96c8d0df53bf89d93d520177bd384682a5e22 Mon Sep 17 00:00:00 2001 From: runoob Date: Tue, 16 Jan 2024 21:37:30 +0800 Subject: [PATCH 05/12] Fix: madouqu custom website;more actor matching attempts --- src/models/crawlers/madouqu.py | 46 +++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/src/models/crawlers/madouqu.py b/src/models/crawlers/madouqu.py index 33c2b92..4441843 100644 --- a/src/models/crawlers/madouqu.py +++ b/src/models/crawlers/madouqu.py @@ -9,12 +9,50 @@ from models.base.web import curl_html from models.crawlers.guochan import get_number_list +from models.config.config import config +from models.crawlers.guochan import get_actor_list, get_lable_list urllib3.disable_warnings() # yapf: disable # import traceback +def get_some_info(title, file_path, info_type, tag='', actor='', series=''): + + all_info = title + file_path + tag + actor + series + + # 未找到标签时,从各种信息里匹配 + if info_type == "tag": + tag_list = [] + all_tag = get_lable_list() + for each in all_tag: + if each in all_info: + tag_list.append(each) + new_tag_list = [] + [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] + return ','.join(new_tag_list) + + # 未找到演员时,看热门演员是否在标题和各种信息里 + if info_type == "actor": + actor_list = [] + all_actor = get_actor_list() + for each in all_actor: + if each in all_info: + actor_list.append(each) + new_actor_list = [] + [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] + return ','.join(new_actor_list) + + # 未找到系列时,从各种信息里匹配 + if info_type == "series": + series_list = [] + all_series = get_lable_list() + for each in all_series: + if each in all_info: + series_list.append(each) + new_series_list = [] + [new_series_list.append(i) for i in series_list if i and i not in new_series_list] + return ','.join(new_series_list) def get_actor_photo(actor): actor = actor.split(',') @@ -25,7 +63,7 @@ def get_actor_photo(actor): return data -def get_detail_info(html, number): +def get_detail_info(html, number, file_path): detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//p//text()') # detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//text()') title_h1 = html.xpath('//div[@class="cao_entry_header"]/header/h1/text()') @@ -48,6 +86,7 @@ def get_detail_info(html, number): cover_url = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p/img/@src') cover_url = cover_url[0] if cover_url else '' # print(number, 
title, actor, cover_url, studio, detail_info) + actor = get_some_info(title, file_path, info_type="actor") if actor == '' else actor return number, title, actor, cover_url, studio @@ -75,6 +114,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file log_info += ' \n 🌐 madouqu' debug_info = '' real_url = appoint_url + madouqu_url = getattr(config, 'madouqu_website', False) try: if not real_url: @@ -82,7 +122,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file number_list, filename_list = get_number_list(number, appoint_number, file_path) n_list = number_list[:1] + filename_list for each in n_list: - real_url = f'https://madouqu.com/?s={each}' + real_url = f'{madouqu_url}/?s={each}' if madouqu_url else f'https://madouqu.com/?s={each}' # real_url = 'https://madouqu.com/?s=XSJ-138.%E5%85%BB%E5%AD%90%E7%9A%84%E7%A7%98%E5%AF%86%E6%95%99%E5%AD%A6EP6' debug_info = f'请求地址: {real_url} ' log_info += web_info + debug_info @@ -111,7 +151,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file raise Exception(debug_info) detail_page = etree.fromstring(response, etree.HTMLParser()) - number, title, actor, cover_url, studio = get_detail_info(detail_page, number) + number, title, actor, cover_url, studio = get_detail_info(detail_page, number, file_path) actor_photo = get_actor_photo(actor) try: From 9c3b28e8e535e4dd6eed4c889a8d4ff495ed08bf Mon Sep 17 00:00:00 2001 From: runoob Date: Wed, 17 Jan 2024 14:23:26 +0800 Subject: [PATCH 06/12] Fix: non-javdb site writes irrelevant searchid field --- src/models/core/nfo.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py index 3084b7a..2143a86 100644 --- a/src/models/core/nfo.py +++ b/src/models/core/nfo.py @@ -329,10 +329,12 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal print(" " + website + "", file=code) # javdb id 输出, 没有时使用番号搜索页 - if 'javdbid' in json_data_nfo and json_data_nfo['javdbid']: - print(" " + json_data_nfo["javdbid"] + "", file=code) - else: - print(" " + number + "", file=code) + if 'javdbid' in json_data_nfo: + # 其他非javdb网站取消强制输出该字段 + if json_data_nfo['javdbid']: + print(" " + json_data_nfo["javdbid"] + "", file=code) + else: + print(" " + number + "", file=code) print("", file=code) json_data['logs'] += "\n 🍀 Nfo done! 
(new)(%ss)" % get_used_time(start_time) return True From 0da944085757402bccc9177b7f9169673e07e309 Mon Sep 17 00:00:00 2001 From: runoob Date: Wed, 17 Jan 2024 16:25:33 +0800 Subject: [PATCH 07/12] Chore: guochan data collation --- src/models/crawlers/guochan.py | 259 +++++++-------------------------- 1 file changed, 52 insertions(+), 207 deletions(-) diff --git a/src/models/crawlers/guochan.py b/src/models/crawlers/guochan.py index c1c08bd..8fb739b 100644 --- a/src/models/crawlers/guochan.py +++ b/src/models/crawlers/guochan.py @@ -15,216 +15,61 @@ def get_lable_list(): - return ['传媒', '国产短视频', '国产精品', '国产AV', 'PsychoPorn色控', '叮叮映画', '涩会', '蜜桃影像传媒', - '大番号番啪啪', 'REAL野性派', '豚豚创媒', '宫美娱乐', '肉肉传媒', '爱妃传媒', '天美传媒', '皇家华人', - '91制片厂', '果冻传媒', 'O-STAR', '兔子先生', '杏吧原创', '杏吧独家', '辣椒原创', '麻豆传媒', '糖心', - '麻豆传媒映画', '红斯灯影像', '绝对领域', '麻麻传媒', '渡边传媒', 'AV帝王', '桃花源', '蝌蚪传媒', 'SWAG', - '麻豆', '杏吧'] + return ["麻豆传媒", "91茄子", "Ed Mosaic", "HongKongDoll", "JVID", "MINI传媒", "SA国际传媒", "TWAV", "乌鸦传媒", + "乐播传媒", "优蜜传媒", "偶蜜国际", "叮叮映画", "哔哩传媒", "大象传媒", "天美传媒", "开心鬼传媒", "微密圈", + "扣扣传媒", "抖阴传媒", "星空无限传媒", "映秀传媒", "杏吧传媒", "果冻传媒", "模密传媒", "爱污传媒", "爱神传媒", + "爱豆传媒", "狂点映像", "猛料原创", "猫爪影像", "皇家华人", "精东影业", "糖心VLOG", "维秘传媒", "草莓视频", "萝莉社", + "蜜桃传媒", "西瓜影视", "起点传媒", "香蕉视频", "PsychoPorn色控", "蜜桃影像传媒", "大番号番啪啪", "REAL野性派", "豚豚创媒", + "宫美娱乐", "肉肉传媒", "爱妃传媒", "91制片厂", "O-STAR","兔子先生", "杏吧原创", "杏吧独家", "辣椒原创", "麻豆传媒映画", "红斯灯影像", + "绝对领域", "麻麻传媒", "渡边传媒", "AV帝王", "桃花源", "蝌蚪传媒", "SWAG", "麻豆", "杏吧", "糖心", "国产短视频", "国产精品", "国产AV", "涩会"] def get_actor_list(): - return [ - '苏妲己', - '苏畅', - '宁洋子', - '沈芯语', - '艾秋', - '吴梦梦', - '尹媚', - '张芸熙', - '夏晴子', - '白佳萱', - '林思妤', - '沈娜娜', - '仙儿媛', - '许安妮', - '刘语珊', - '刘思慧', - '叶一涵', - '林亦涵', - '雪千夏', - '欧美玲', - '赵佳美', - '李慕儿', - '徐韵珊', - '苏娅', - '糖果屋', - '王茜', - '李婧琪', - '夏滢', - '顾伊梦', - '杜冰若', - '赵颖儿', - '秦可欣', - '莫安安', - '安娜', - '黎星若', - '仙儿', - '林予曦', - '蒋佑怡', - '许书曼', - '白晶晶', - '王有容', - '琳希', - '李恩琦', - '赵美凤', - '王欣', - '徐筱欣', - '黄雅曼', - '伊靖瑶', - '菲于娜', - '罗瑾萱', - '金丞熙', - '李文雯', - '苏清歌', - '付妙菱', - '钟丽琪', - '张娅庭', - '蜜苏', - '凌薇', - '叶凡舒', - '董小宛', - '程雨沫', - '瑶贝', - '郭瑶瑶', - '李嘉欣', - '辰悦', - '李曼妮', - '洛雪', - '千鹤', - '袁庭妮', - '林思好', - '张云熙', - '杜鹃', - '玛丽莲', - '李茹', - '何苗', - '黄雪纯', - '田恬', - '李琼', - '聂小倩', - '张晴', - '丁香', - '林凤娇', - '刘颖儿', - '杨思敏', - '李忆彤', - '伊蒂丝', - '绿帽先生', - '戚小怜', - '杨柳', - '唐茜', - '苏艾文', - '张曼青', - '斑斑', - '孟若羽', - '陈圆圆', - '雷梦娜', - '氖氖', - '仙儿', - '艾玛', - '蔚曼', - '静静', - '艾瑞卡', - '娜米', - '莉娜', - '乔安', - '林子涵', - '萱萱', - '糖糖', - '徐婕', - '王欣', - '白颖', - '吴芮瑜', - '韩棠', - '季妍希', - '沙耶香', - '七七', - '莉娜乔安', - '美雪樱', - '柚木结爱', - '黑田奈奈', - '王亦舒', - '张雅婷', - '李文静', - '肖泳', - '韩小雅', - '神山奈奈', - '白川麻衣', - '茜茜', - '夜夜', - '高梨遥香', - 'HongKongDoll', - '玩偶姐姐', - '蘇妲己', - '蘇暢', - '寧洋子', - '沈芯語', - '吳夢夢', - '張芸熙', - '仙兒媛', - '許安妮', - '劉語珊', - '劉思慧', - '葉一涵', - '歐美玲', - '趙佳美', - '李慕兒', - '徐韻珊', - '蘇婭', - '夏瀅', - '顧伊夢', - '趙穎兒', - '仙兒', - '蔣佑怡', - '許書曼', - '趙美鳳', - '黃雅曼', - '伊靖瑤', - '羅瑾萱', - '蘇清歌', - '鍾麗琪', - '張婭庭', - '蜜蘇', - '葉凡舒', - '瑤貝', - '郭瑤瑤', - '辰悅', - '千鶴', - '張雲熙', - '杜鵑', - '瑪麗蓮', - '黃雪純', - '李瓊', - '聶小倩', - '張晴', - '林鳳嬌', - '劉穎兒', - '楊思敏', - '李憶彤', - '伊蒂絲', - '綠帽先生', - '戚小憐', - '楊柳', - '蘇艾文', - '張曼青', - '陳圓圓', - '雷夢娜', - '仙兒', - '艾瑪', - '靜靜', - '喬安', - '白穎', - '吳芮瑜', - '韓棠', - '莉娜喬安', - '美雪櫻', - '柚木結愛', - '張雅婷', - '李文靜', - '韓小雅', - '高梨遙香', - ] + return ['Madison Summers', 'Spencer Bradley', 'Madison Morgan', 'Rosalyn Sphinx', 'Braylin Bailey', 'Whitney Wright', 'Victoria Voxxx', 'Alexia Anders', + 'Bella Rolland', 'Violet Myers', 'Sophia Leone', 'Violet Starr', 'Eliza Ibarra', 
'HongKongDoll', 'Keira Croft', 'April Olsen', 'Avery Black', + 'Amber Moore', 'Anny Aurora', 'Skylar Snow', 'Harley Haze', 'Paige Owens', 'Vanessa Sky', 'MasukuChan', 'Kate Bloom', 'Kimmy Kimm', 'Ana Foxxx', + 'Lexi Luna', 'Gia Derza', 'Skye Blue', 'Nico Love', 'Alyx Star', 'Ryan Reid', 'Kira Noir', 'Karma Rx', '下面有根棒棒糖', 'Vivian姐', 'COLA酱', + 'cola醬', 'Stacy', 'ROXIE', '真木今日子', '小七软同学', 'Chloe', 'Alona', '小日向可怜', 'NANA', '玩偶姐姐', '粉色情人', '桥本香菜', '冉冉学姐', '小二先生', + '饼干姐姐', 'Rona', '不见星空', '米娜学姐', '阿蛇姐姐', '樱花小猫', '樱井美里', '宸荨樱桃', '樱空桃桃', '牛奶泡芙', '91兔兔', '棉花糖糖', '桥本爱菜', + '许木学长', 'MOMO', '驯鹿女孩', '高梨遥香', 'DORY', '冬月结衣', 'Aida', '香菜公主', '藤田美绪', '浅尾美羽', '天音美纱', '中条爱莉', '三月樱花', 'Emma', + 'Vita', '千夜喵喵', '水原圣子', '白川麻衣', '池田奈美', '西村莉娜', 'A天使爱', '中野惠子', '麻衣CC', '樱桃空空', 'LENA', '小泽纱香', '木下日葵', '中岛芳子', + '弥生美月', '逢见梨花', '宇佐爱花', '沙月芽衣', '羽月萌音', '前田由美', '伊东爱瑠', 'Misa', '绿帽先生', '莉娜乔安', '柚木结爱', '黑田奈奈', '神山奈奈', + '孟若羽', '夏晴子', '吴梦梦', '沈娜娜', '李蓉蓉', '林思妤', '仙儿媛', '金宝娜', '季妍希', '温芮欣', '吴文淇', '苏语棠', '秦可欣', '吴芳宜', '李娜娜', + '乐奈子', '宋南伊', '小水水', '白允儿', '管明美', '雪千夏', '苏清歌', '玥可岚', '梁芸菲', '白熙雨', '小敏儿', '楚梦舒', '柚子猫', '姚宛儿', '宋雨川', + '舒可芯', '苏念瑾', '白沛瑶', '林沁儿', '唐雨菲', '李允熙', '张芸熙', '寻小小', '白靖寒', '钟宛冰', '李薇薇', '米菲兔', '雷梦娜', '董悦悦', '袁子仪', + '赖畇希', '王以欣', '夏禹熙', '狐不妖', '凌波丽', '黎芷萱', '陆斑比', '辛尤里', '小猫咪', '顾桃桃', '南芊允', '岚湘庭', '林芊彤', '梁佳芯', '林凤娇', + '明日香', '绫波丽', '邓紫晴', '赵一曼', '吴茜茜', '锅锅酱', '倪哇哇', '潘雨曦', '吴恺彤', '美杜莎', '郭童童', '陈可心', '莫夕慈', '沈芯语', '董小宛', + '苏艾文', '翁雨澄', '赵晓涵', '小桃酱', '宋东琳', '香月怜', '李文雯', '白若冰', '徐夜夜', '真希波', '爱丽丝', '张宇芯', '金善雅', '李依依', '苏安亚', + '奶咪酱', '白葵司', '罗瑾萱', '宁洋子', '小夜夜', '白晶晶', '张雅婷', '吴心语', '林曼芸', '项子甯', '吳芳宜', '苏小小', '文冰冰', '韩宝儿', '白星雨', + '林怡梦', '张欣妍', '七濑恋', '白思吟', '吴凯彤', '溫芮欣', '林可菲', '黎芷媗', '御梦子', '苏雨彤', '古伊娜', '聂小倩', '陈圆圆', '沙美辰', '林妙可', + '乐淆雪', '李恩娜', '周晴晴', '杨思敏', '李曼妮', '戚小怜', '谢语彤', '王筱璐', '卢珊珊', '程诗诗', '林玥玥', '白瞳瞳', '魏乔安', '米胡桃', '施子涵', + '北野爱', '杜冰若', '玛丽莲', '胡蓉蓉', '万静雪', '花语柔', '萧悦儿', '林晓雪', '兰心洁', '神谷怜', '唐雨霏', '鸡蛋饼', '沈湘妮', '费爵娜', '小美惠', + '大奶露', '向若云', '苏小沫', '榨汁姬', '陈星然', '夏雨荷', '姚彤彤', '莫云雪', '艾瑞卡', '黄雪纯', '赵雅琳', '叶宸欣', '伊琬琳', '陈美惠', '金巧巧', + '陈美琳', '陆思涵', '顾小北', '陈小雨', '维里娜', '兔小白', '叶子红', '美凉子', '李丹彤', '李微微', '白婷婷', '艾米酱', '刘小姗', '白童童', '张琪琪', + 'Yua', '小玩子', '岚可彤', '都可可', '李慕儿', '叶一涵', '赵佳美', '董小婉', '钟丽琪', '韩小雅', '杨朵儿', '叶梦语', '程雨沫', '张曼青', '纪妍希', '伊婉琳', + '凌雨萱', '潘甜甜', '美竹玲', '韩依人', '奈奈子', '林雪漫', '宋甜甜', '陆雪琪', '宋妮可', '陆子欣', '范可可', '许依然', '苏小新', '蒋梦琳', '李可欣', + '小鹿酱', '小林杏', '陶杏儿', '明步奈', '苏宁儿', '白潼潼', '增田枫', '特污兔', '何安汝', '倪菀儿', '唐可可', '口罩酱', '小千绪', '糖糖儿', '许安妮', + '李婧琪', '刘思慧', '欧阳晶', '欧美玲', '林亦涵', '钟以彤', '许书曼', '付妙菱', '伊靖瑶', '张娅庭', '韩小野', '宫泽蓝', '冯思雨', '林小樱', '刘颖儿', + '莫潇潇', '胡心瑶', '林雨露', '苏婧薇', '许月珍', '陈若瑶', '吴芮瑜', '叶如梦', '刘依依', '吴语菲', '张妮妮', '林子涵', '张子瑜', '周卿卿', '李师师', + '苏语堂', '方紫璐', '袁采菱', '刘清韵', '李曼丽', '刘小雯', '姬咲华', '高小颜', '蔡晓雨', '梁如意', '林语桐', '王小妮', '唐月琴', '星谷瞳', '何小丽', + '张婉妍', '酒井爱', '张秀玲', '晚晚酱', '薛梦琪', '李乐乐', '张佳晨', '程媛媛', '沐语柔', '安琪拉', '韩倪希', '苏妲己', '白佳萱', '刘语珊', '徐韵珊', + '糖果屋', '顾伊梦', '赵颖儿', '莫安安', '黎星若', '林予曦', '蒋佑怡', '王有容', '李恩琦', '赵美凤', '徐筱欣', '黄雅曼', '菲于娜', '金丞熙', '叶凡舒', + '郭瑶瑶', '李嘉欣', '袁庭妮', '林思好', '张云熙', '李忆彤', '伊蒂丝', '沙耶香', '美雪樱', '王亦舒', '李文静', '鸡教练', '斑斑', '坏坏', '糖糖', '艾秋', + '凌薇', '莉娜', '韩棠', '苡若', '尤莉', '优娜', '林嫣', '徐蕾', '周甯', '唐茜', '香菱', '佳芯', '湘湘', '米欧', '斑比', '蜜苏', '小婕', '艾熙', '娃娃', + '艾玛', '雪霏', '夜夜', '欣欣', '乔安', '羽芮', '美酱', '师师', '玖玖', '橙子', '晨曦', '苏娅', '黎儿', '晨晨', '嘉洛', '小遥', '苏畅', '琪琪', '苡琍', + '李慕', '心萱', '昀希', '黎娜', '乐乐', '樱桃', '桐桐', '苏璇', '安娜', '悠悠', '茉莉', '宛冰', '静静', '丝丝', '菲菲', '樱樱', '波妮', '唐芯', '小野', + '何苗', '甜心', '瑶瑶', '小捷', '薇薇', '美樱', '宁静', 
'欧妮', '吉吉', '小桃', '绯丽', '嘉琪', '咪妮', '雯茜', '小洁', '李琼', '唐霏', '岚玥', '熙熙', + '米娅', '舒舒', '斯斯', '欣怡', '妍儿', '阿雅', '宋可', '畇希', '柔伊', '雅沁', '惠敏', '露露', '艾悠', '娜娜', '李娜', '肖云', '王玥', '林洋', '清洛', + '艾鲤', '依涵', '半雪', '琦琦', '莎莎', '小冉', '琳怡', '莉奈', '梅子', '啤儿', '瑶贝', '杨柳', '童汐', '米亚', '琳达', '晴天', 'KK', '紫宸', '淑怡', + '花花', '金铭', '程葳', '妍希', '咪妃', '茜茜', '小蜜', '凌萱', '觅嫣', '涵涵', '欲梦', '美琳', '杜鹃', '许诺', '兮兮', '白鹿', '虞姬', '丽萨', '蔷薇', + '小影', '优优', '茶茶', '可儿', '甜甜', '憨憨', '波尼', '依颂', '依依', '思思', '芳情', '月牙', '小爱', '淳儿', '苗方', '茶理', '苹果', '苏然', '陶子', + '董欣', '羽熙', '清沐', '林襄', '娃诺', '洁咪', '小昭', '球球', '紫萱', '南兰', '安琪', '可乐', '夏露', '诗琪', '陈韵', '丽娜', '苏旋', '月月', '石榴', + '米兰', '恩恩', '西子', '芷萱', '酥酥', '王茜', '千鹤', '雪见', '姜洁', '张晴', '辰悦', '丁香', '白颖', '穆娜', '小芳', '吉娜', '秋霞', '无双', '夏宝', + '舒涵', '小柔', '小小', '璇元', '米砂', '余丽', '美嘉', '莉莉', '奈奈', '黑糖', '晴子', '多乙', '徐婕', '闵闵', '小雪', '洋洋', '明儿', '苏茜', '芯怡', + '姚茜', '百合', '婉婷', '小乔', '芽芽', '婕珍', '乔乔', '紫寒', '小薇', '菜菜', '洁米', '夏天', '灵枝', '语伊', '徐艳', '王佩', '希汶', '雅捷', '喵喵', + '尤奈', '仙儿', '氖氖', '蔚曼', '田恬', '颂潮', '小婵', '千凌', '李燕', '林芳', '杨桃', '艾莉', '落落', '冯雪', '王蓉', '妖妖', '雨晨', '心雪', '穆雪', + '韩焉', '邱月', '檀雅', '柯柯', '七七', '鱼儿', '丹丹', '简一', '淑仪', '小哇', '朵儿', '妲己', '云朵', '唐菲', '邦妮', '白英', '夏夏', '安安', '小艺', + '丽丽', '敏敏', '空空', '椿芽', '小言', '李蕊', '水水', '小鱼', '艾艾', '尹媚', '夏滢', '琳希', '王欣', '洛雪', '李茹', '娜米', '萱萱', '肖泳'] def get_number_list(number, appoint_number='', file_path=''): # 处理国产番号 From 96f68ef19280a65ad38a1306c2dabe8cf390d014 Mon Sep 17 00:00:00 2001 From: runoob Date: Sat, 20 Jan 2024 22:05:33 +0800 Subject: [PATCH 08/12] Fix: 7mmtv get more extrafanart pics --- src/models/crawlers/mmtv.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/models/crawlers/mmtv.py b/src/models/crawlers/mmtv.py index f5cc6b1..bd51592 100644 --- a/src/models/crawlers/mmtv.py +++ b/src/models/crawlers/mmtv.py @@ -174,7 +174,12 @@ def get_tag(html): def get_extrafanart(html): - result = html.xpath('//a[@class="lazyload screens-item fresco"]/@href') + # 前几张 + result1 = html.xpath('//span/img[contains(@class, "lazyload")]/@data-src') + # 其他隐藏需点击的 + if result2 := html.xpath('//div[contains(@class, "fullvideo")]/script[@language="javascript"]/text()'): + result2 = re.findall(r'https?://.+?\.jpe?g', str(result2)) + result = result1 + result2 return result if result else '' @@ -289,7 +294,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file 'runtime': runtime, 'score': '', 'series': '', - 'country': 'JP', + 'country': 'CN', 'director': director, 'studio': studio, 'publisher': publisher, @@ -350,7 +355,8 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file # print(main('H4610-ki230225')) # print(main('c0930-ki221218')) # print(main('c0930-hitozuma1407')) - print(main('h0930-ori1665')) + #print(main('h0930-ori1665')) + print(main('h0930-ori1665', appoint_url='https://7mm002.com/zh/amateur_content/107108/content.html')) # print(main('RBD-293')) # print(main('LUXU-728')) # 无结果 # print(main('fc2-1050737')) # 标题中有/ From b97dd3bcfd7a955f47dc499d57d5630a20302047 Mon Sep 17 00:00:00 2001 From: runoob Date: Sat, 20 Jan 2024 22:09:09 +0800 Subject: [PATCH 09/12] Fix: madouqu subtle parameter adjustment --- src/models/crawlers/madouqu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/models/crawlers/madouqu.py b/src/models/crawlers/madouqu.py index 4441843..6ee0863 100644 --- a/src/models/crawlers/madouqu.py +++ b/src/models/crawlers/madouqu.py @@ -26,7 +26,7 @@ def get_some_info(title, 
file_path, info_type, tag='', actor='', series=''): tag_list = [] all_tag = get_lable_list() for each in all_tag: - if each in all_info: + if each in all_info.upper(): tag_list.append(each) new_tag_list = [] [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] @@ -37,7 +37,7 @@ def get_some_info(title, file_path, info_type, tag='', actor='', series=''): actor_list = [] all_actor = get_actor_list() for each in all_actor: - if each in all_info: + if each in all_info.upper(): actor_list.append(each) new_actor_list = [] [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] @@ -48,7 +48,7 @@ def get_some_info(title, file_path, info_type, tag='', actor='', series=''): series_list = [] all_series = get_lable_list() for each in all_series: - if each in all_info: + if each in all_info.upper(): series_list.append(each) new_series_list = [] [new_series_list.append(i) for i in series_list if i and i not in new_series_list] @@ -94,7 +94,8 @@ def get_real_url(html, number_list): item_list = html.xpath('//div[@class="entry-media"]/div/a') for each in item_list: detail_url = each.get('href') - title = each.xpath('img[@class="lazyload"]/@alt')[0] + # lazyload属性容易改变,去掉也能拿到结果 + title = each.xpath('img[@class]/@alt')[0] if title and detail_url: for n in number_list: temp_n = re.sub(r'[\W_]', '', n).upper() From bada9e69e4c9ef470757cc880876a2e346bc64e0 Mon Sep 17 00:00:00 2001 From: runoob Date: Sat, 20 Jan 2024 22:19:10 +0800 Subject: [PATCH 10/12] Fix: guochan crawlers remove useless characters; number recognition adjustment --- src/models/crawlers/guochan.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/models/crawlers/guochan.py b/src/models/crawlers/guochan.py index 8fb739b..6dd1d38 100644 --- a/src/models/crawlers/guochan.py +++ b/src/models/crawlers/guochan.py @@ -6,6 +6,7 @@ import urllib3 import zhconv +from models.base.number import remove_escape_string urllib3.disable_warnings() # yapf: disable @@ -73,6 +74,11 @@ def get_actor_list(): def get_number_list(number, appoint_number='', file_path=''): # 处理国产番号 + + # 国产匹配番号或标题前也可以先排除路径中多余字符 + if file_path: + file_path = remove_escape_string(file_path) + file_name = os.path.splitext(os.path.split(file_path)[1])[0].upper() if file_path else '' number = number.upper() number_list = [] # 返回一个番号列表,用来搜索 @@ -202,7 +208,11 @@ def get_number_list(number, appoint_number='', file_path=''): # 处理国产番 number_list.extend([number_normal_4, number_has_nothing_4, number_has_space_4]) if len(number_list): break + # 番号识别将纯数字和字母放在最前面(将长度最短的放前面即可),刮削网站一般也只取 number_list 第一项进行搜索,其他用于搜索结果页比对 + sorted_number_list = sorted(number_list, key=lambda x: len(x)) + + # 以下处理没有番号的作品 # 台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品 # PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画 # 國産麻豆AV 麻豆番外 大番號女優空降上海 特別篇 沈芯語 @@ -248,13 +258,23 @@ def get_number_list(number, appoint_number='', file_path=''): # 处理国产番 # 把文件名加到列表 filename_list.append(real_file_name) + # 演员后面的第一句成功刮削概率较高,插入列表第一项 + # 超级丝袜控180大长腿女神▌苹果▌我的室友 第八篇 黑丝女仆骚丁小穴湿淋淋 肉棒塞满激怼爆射 + # 17205-最新极品天花板小萝莉▌粉色情人▌摄影师的威胁 粗屌爆艹少女白虎嫩鲍 极速刮擦蜜壶淫靡下体 + # 潮喷淫娃御姐〖小水水〗和异地大奶女友开房,激情互舔口爆高潮喷水,黑丝美腿女神极度淫骚 潮喷不停 + # 极品爆乳鲜嫩美穴貌美尤物▌苏美奈▌家政女仆的肉体服务 肏到羞耻喷汁 极射中出鲜嫩美穴 + # 【小酒改头换面】,罕见大胸嫩妹,小伙今夜捡到宝了 + if u := re.search(r'(【.+】|▌.+▌|〖.+〗|『.+』)[,,\- ]?(\S{6,18}?)[,,\- ]', real_file_name): + search_char = u.group(2) + filename_list.insert(0, search_char) + # 转繁体 filename_list.append(zhconv.convert(filename_list[0], 'zh-hant')) # 去重去空 new_number_list = [] new_filename_list = [] - [new_number_list.append(i) for i in 
number_list if i and i not in new_number_list] + [new_number_list.append(i) for i in sorted_number_list if i and i not in new_number_list] [new_filename_list.append(i) for i in filename_list if i and i not in new_filename_list] return new_number_list, new_filename_list From edd03b7282106ae2e0eac9365f1cb4ae3a6fac76 Mon Sep 17 00:00:00 2001 From: runoob Date: Tue, 30 Jan 2024 21:29:21 +0800 Subject: [PATCH 11/12] Feat: add hscangku and cableav crawlers --- src/controllers/main_window/main_window.py | 2 + src/models/config/config_manual.py | 4 + src/models/core/crawler.py | 8 +- src/models/crawlers/cableav.py | 281 +++++++++++++++++++ src/models/crawlers/hscangku.py | 300 +++++++++++++++++++++ 5 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 src/models/crawlers/cableav.py create mode 100644 src/models/crawlers/hscangku.py diff --git a/src/controllers/main_window/main_window.py b/src/controllers/main_window/main_window.py index b471cf3..3b3676b 100644 --- a/src/controllers/main_window/main_window.py +++ b/src/controllers/main_window/main_window.py @@ -2061,6 +2061,8 @@ def _netResult(self): 'mdtv': ['https://www.mdpjzip.xyz', ''], 'madouqu': ['https://madouqu.com', ''], 'cnmdb': ['https://cnmdb.net', ''], + 'hscangku': ['https://hscangku.net', ''], + 'cableav': ['https://cableav.tv', ''], 'lulubar': ['https://lulubar.co', ''], 'love6': ['https://love6.tv', ''], 'yesjav': ['http://www.yesjav.info', ''], diff --git a/src/models/config/config_manual.py b/src/models/config/config_manual.py index e97a984..5c59e21 100644 --- a/src/models/config/config_manual.py +++ b/src/models/config/config_manual.py @@ -67,6 +67,8 @@ class ManualConfig: 'lulubar', 'madouqu', 'mdtv', + 'hscangku', + 'cableav', 'mgstage', 'mywife', 'prestige', @@ -513,6 +515,8 @@ class ManualConfig: 'mdtv': 'mdtv', 'mdpjzip': 'mdtv', 'madouqu': 'madouqu', + 'hsck': 'hscangku', + 'cableav': 'cableav', 'mgstage': 'mgstage', '7mmtv': '7mmtv', 'bb9711': '7mmtv', diff --git a/src/models/core/crawler.py b/src/models/core/crawler.py index 76386b4..49376c2 100644 --- a/src/models/core/crawler.py +++ b/src/models/core/crawler.py @@ -11,7 +11,7 @@ from models.core.flags import Flags from models.crawlers import airav_cc_new, airav_new, avsex, avsox, cnmdb, dahlia, dmm, faleno, fantastica, fc2, fc2club, \ fc2hub, freejavbt, getchu, getchu_dmm, giga, hdouban, iqqtv_new, jav321, javbus, javdb, javlibrary_new, kin8, love6, \ - lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity + lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity, hscangku, cableav from models.entity.enums import FileMode @@ -137,6 +137,12 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai elif website == 'madouqu': json_data = json.loads( madouqu.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number)) + elif website == 'hscangku': + json_data = json.loads( + hscangku.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number)) + elif website == 'cableav': + json_data = json.loads( + cableav.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number)) elif website == 'getchu': json_data = json.loads(getchu.main(file_number, appoint_url, log_info, req_web, language)) elif website == 'getchu_dmm': diff --git a/src/models/crawlers/cableav.py b/src/models/crawlers/cableav.py new file mode 100644 index 0000000..52b8164 --- /dev/null +++ b/src/models/crawlers/cableav.py @@ -0,0 +1,281 
@@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import json +import re +import time + +import urllib3 +import zhconv +from lxml import etree + +from models.base.web import curl_html +from models.config.config import config +from models.crawlers.guochan import get_number_list +from models.crawlers.guochan import get_actor_list, get_lable_list + +urllib3.disable_warnings() # yapf: disable + +# import traceback + +def get_some_info(title, file_path, info_type, tag='', actor='', series=''): + + all_info = title + file_path + tag + actor + series + + # 未找到标签时,从各种信息里匹配 + if info_type == "tag": + tag_list = [] + all_tag = get_lable_list() + for each in all_tag: + if each in all_info.upper(): + tag_list.append(each) + new_tag_list = [] + [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] + return ','.join(new_tag_list) + + # 未找到演员时,看热门演员是否在标题和各种信息里 + if info_type == "actor": + actor_list = [] + all_actor = get_actor_list() + for each in all_actor: + if each in all_info.upper(): + actor_list.append(each) + new_actor_list = [] + [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] + return ','.join(new_actor_list) + + # 未找到系列时,从各种信息里匹配 + if info_type == "series": + series_list = [] + all_series = get_lable_list() + for each in all_series: + if each in all_info.upper(): + series_list.append(each) + new_series_list = [] + [new_series_list.append(i) for i in series_list if i and i not in new_series_list] + return ','.join(new_series_list) + +def get_actor_photo(actor): + actor = actor.split(',') + data = {} + for i in actor: + actor_photo = {i: ''} + data.update(actor_photo) + return data + + +def get_detail_info(html, number, file_path): + title_h1 = html.xpath('//div[@class="entry-content "]/p/text()') + title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number + actor = get_some_info(title, file_path, info_type="actor") + tmp_tag = html.xpath('//header//div[@class="categories-wrap"]/a/text()') + # 标签转简体 + tag = zhconv.convert(tmp_tag[0], 'zh-cn') if tmp_tag else '' + cover_url = html.xpath(f'//meta[@property="og:image"]/@content') + cover_url = cover_url[0] if cover_url else '' + + return number, title, actor, cover_url, tag + + +def get_real_url(html, number_list): + item_list = html.xpath('//h3[contains(@class,"title")]//a[@href and @title]') + for each in item_list: + #href="https://cableav.tv/Xq1Sg3SvZPk/" + detail_url = each.get('href') + title = each.xpath('text()')[0] + if title and detail_url: + for n in number_list: + temp_n = re.sub(r'[\W_]', '', n).upper() + temp_title = re.sub(r'[\W_]', '', title).upper() + if temp_n in temp_title: + return True, n, title, detail_url + return False, '', '', '' + + +def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''): + start_time = time.time() + website_name = 'cableav' + req_web += '-> %s' % website_name + title = '' + cover_url = '' + web_info = '\n ' + log_info += ' \n 🌐 cableav' + debug_info = '' + real_url = appoint_url + cableav_url = getattr(config, 'cableav_website', 'https://cableav.tv') + + try: + if not real_url: + # 处理番号 + number_list, filename_list = get_number_list(number, appoint_number, file_path) + n_list = number_list[:1] + filename_list + for each in n_list: + real_url = f'{cableav_url}/?s={each}' + # real_url = 'https://cableav.tv/s?s=%E6%9F%9A%E5%AD%90%E7%8C%AB' + debug_info = f'请求地址: {real_url} ' + log_info += web_info + debug_info + result, response = curl_html(real_url) + if not result: + debug_info = 
'网络请求错误: %s' % response + log_info += web_info + debug_info + raise Exception(debug_info) + search_page = etree.fromstring(response, etree.HTMLParser()) + result, number, title, real_url = get_real_url(search_page, n_list) + # real_url = 'https://cableav.tv/hyfaqwfjhio' + if result: + break + else: + debug_info = '没有匹配的搜索结果' + log_info += web_info + debug_info + raise Exception(debug_info) + + debug_info = f'番号地址: {real_url} ' + log_info += web_info + debug_info + result, response = curl_html(real_url) + + if not result: + debug_info = '没有找到数据 %s ' % response + log_info += web_info + debug_info + raise Exception(debug_info) + + detail_page = etree.fromstring(response, etree.HTMLParser()) + number, title, actor, cover_url, tag = get_detail_info(detail_page, number, file_path) + actor_photo = get_actor_photo(actor) + + try: + dic = { + 'number': number, + 'title': title, + 'originaltitle': title, + 'actor': actor, + 'outline': '', + 'originalplot': '', + 'tag': tag, + 'release': '', + 'year': '', + 'runtime': '', + 'score': '', + 'series': '', + 'country': 'CN', + 'director': '', + 'studio': '', + 'publisher': '', + 'source': 'cableav', + 'website': real_url, + 'actor_photo': actor_photo, + 'cover': cover_url, + 'poster': '', + 'extrafanart': '', + 'trailer': '', + 'image_download': False, + 'image_cut': 'no', + 'log_info': log_info, + 'error_info': '', + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + 'mosaic': '国产', + 'wanted': '', + } + debug_info = '数据获取成功!' + log_info += web_info + debug_info + dic['log_info'] = log_info + except Exception as e: + debug_info = '数据生成出错: %s' % str(e) + log_info += web_info + debug_info + raise Exception(debug_info) + + except Exception as e: + # print(traceback.format_exc()) + debug_info = str(e) + dic = { + 'title': '', + 'cover': '', + 'website': '', + 'log_info': log_info, + 'error_info': debug_info, + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + } + dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}} + js = json.dumps( + dic, + ensure_ascii=False, + sort_keys=False, + indent=4, + separators=(',', ': '), + ) + return js + + +if __name__ == '__main__': + # yapf: disable + # print(main('GDCM-018')) + # print(main('国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度', file_path='国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度')) + # print(main('RS001', file_path='RS-001.红斯灯影像.REDSTEN.淫白大胜利.上.男女水中竞赛.败方被强制插入高潮连连')) + # print(main('MD-0269', file_path='MD-0269.梁佳芯.唐芯.换妻性爱淫元宵.正月十五操骚鲍.麻豆传媒映画原创中文原版收藏')) + # print(main('sh-006', file_path='SH-006.谢冰岚.神屌侠侣.是谁操了我的小龙女.涩会传媒')) + # print(main('PMC-085', file_path='PMC/PMC-085.雪霏.出差借宿小姨子乱伦姐夫.特别照顾的肉体答谢.蜜桃影像传媒.ts')) + # print(main('TM-0165', file_path='TM0165.王小妮.妈妈的性奴之路.性感少妇被儿子和同学调教成性奴.天美传媒')) + # print(main('mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒')) + # print(main('mini06', file_path='mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒')) + # print(main('mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒', file_path='mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒')) + # print(main('XSJ138', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品')) + # print(main('DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌', file_path='DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌')) + # print(main('MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作', file_path='MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作')) + # print(main('MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列', file_path='MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列')) + # print(main('XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品')) + print(main('真实记录反差', 
file_path='真实记录反差')) + # print(main('MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样', file_path='MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样')) + # print(main('MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版', file_path='MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版')) + # print(main('MSD-023', file_path='MSD023.袁子仪.杨柳.可爱女孩非亲妹.渴望已久的(非)近亲性爱.麻豆传媒映画.Model.Seeding系列.mp4')) + # print(main('', file_path='夏日回忆 贰')) + # print(main('MDX-0016')) + # print(main('MDSJ-0004')) + # print(main('RS-020')) + # print(main('PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒', file_path='PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒')) + # print(main('老公在外出差家里的娇妻被入室小偷强迫性交 - 美酱')) + # print(main('', file_path='夏日回忆 贰 HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4')) + # print(main('', file_path='HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4')) + # print(main('', file_path="【HongKongDoll玩偶姐姐.短篇集.情人节特辑.Valentine's Day Special-cd2")) + # print(main('', file_path='PMC-062 唐茜.綠帽丈夫連同新弟怒操出軌老婆.強拍淫蕩老婆被操 唐茜.ts')) + # print(main('', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画')) + # print(main('淫欲游戏王.EP6', appoint_number='淫欲游戏王.EP5', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')) # EP不带.才能搜到 + # print(main('', file_path='PMS-003.职场冰与火.EP3设局.宁静.苏文文.设局我要女人都臣服在我胯下.蜜桃影像传媒')) + # print(main('', file_path='PMS-001 性爱公寓EP04 仨人.蜜桃影像传媒.ts')) + # print(main('', file_path='PMS-001.性爱公寓EP03.ts')) + # print(main('', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.ts')) + # print(main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')) + # main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts') + # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱')) # 简体搜不到 + # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木结爱.TS')) + # '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛', '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-', ' 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛'] + # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱.TS')) + # print(main('', file_path='PMS-001 性爱公寓EP02 女王 蜜桃影像传媒 -莉娜乔安.TS')) + # print(main('91CM-081', file_path='91CM-081.田恬.李琼.继母与女儿.三.爸爸不在家先上妹妹再玩弄母亲.果冻传媒.mp4')) + # print(main('91CM-081', file_path='MDJ-0001.EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.麻豆传媒映画.mp4')) + # print(main('91CM-081', file_path='MDJ0001 EP2 AV 淫兽鬼父 陈美惠 .TS')) + # print(main('91CM-081', file_path='MXJ-0005.EP1.弥生美月.小恶魔高校生.与老师共度的放浪补课.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='PH-US-002.色控.音乐老师全裸诱惑.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.TS')) + # print(main('91CM-081', file_path='MD-0140-2.蜜苏.家有性事EP2.爱在身边.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDUS系列[中文字幕].LAX0025.性感尤物渴望激情猛操.RUCK ME LIKE A SEX DOLL.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='REAL野性派001-朋友的女友讓我最上火.TS')) + # print(main('91CM-081', file_path='MDS-009.张芸熙.巨乳旗袍诱惑.搔首弄姿色气满点.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDS005 被雇主强上的熟女家政妇 大声呻吟被操到高潮 杜冰若.mp4.TS')) + # print(main('91CM-081', file_path='TT-005.孟若羽.F罩杯性感巨乳DJ.麻豆出品x宫美娱乐.TS')) + # print(main('91CM-081', file_path='台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品.TS')) + # print(main('91CM-081', file_path='PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='鲍鱼游戏SquirtGame.吸舔碰糖.失败者屈辱凌辱.TS')) + # print(main('91CM-081', file_path='导演系列 外卖员的色情体验 麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDS007 骚逼女友在作妖-硬上男友当玩具 叶一涵.TS')) + # print(main('MDM-002')) # 去掉标题最后的发行商 + # print(main('MDS-007')) # 数字要四位才能搜索到,即 MDS-0007 MDJ001 EP1 
我的女优物语陈美惠.TS + # print(main('MDS-007', file_path='MDJ001 EP1 我的女优物语陈美惠.TS')) # 数字要四位才能搜索到,即 MDJ-0001.EP1 + # print(main('91CM-090')) # 带横线才能搜到 + # print(main('台湾SWAG chloebabe 剩蛋特辑 干爆小鹿')) # 带空格才能搜到 + # print(main('淫欲游戏王EP2')) # 不带空格才能搜到 + # print(main('台湾SWAG-chloebabe-剩蛋特輯-幹爆小鹿')) + # print(main('MD-0020')) + # print(main('mds009')) + # print(main('mds02209')) + # print(main('女王的SM调教')) + # print(main('91CM202')) + # print(main('91CM-202')) diff --git a/src/models/crawlers/hscangku.py b/src/models/crawlers/hscangku.py new file mode 100644 index 0000000..20592b9 --- /dev/null +++ b/src/models/crawlers/hscangku.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import json +import re +import time + +import urllib3 +import zhconv +from lxml import etree + +from models.base.web import curl_html +from models.config.config import config +from models.crawlers.guochan import get_number_list +from models.crawlers.guochan import get_actor_list, get_lable_list + +urllib3.disable_warnings() # yapf: disable + +# import traceback + +def get_some_info(title, file_path, info_type, tag='', actor='', series=''): + + all_info = title + file_path + tag + actor + series + + # 未找到标签时,从各种信息里匹配 + if info_type == "tag": + tag_list = [] + all_tag = get_lable_list() + for each in all_tag: + if each in all_info.upper(): + tag_list.append(each) + new_tag_list = [] + [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] + return ','.join(new_tag_list) + + # 未找到演员时,看热门演员是否在标题和各种信息里 + if info_type == "actor": + actor_list = [] + all_actor = get_actor_list() + for each in all_actor: + if each in all_info.upper(): + actor_list.append(each) + new_actor_list = [] + [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] + return ','.join(new_actor_list) + + # 未找到系列时,从各种信息里匹配 + if info_type == "series": + series_list = [] + all_series = get_lable_list() + for each in all_series: + if each in all_info.upper(): + series_list.append(each) + new_series_list = [] + [new_series_list.append(i) for i in series_list if i and i not in new_series_list] + return ','.join(new_series_list) + +def get_actor_photo(actor): + actor = actor.split(',') + data = {} + for i in actor: + actor_photo = {i: ''} + data.update(actor_photo) + return data + + +def get_detail_info(html, real_url, number, file_path): + href = re.split(r'[/.]', real_url)[-2] + title_h1 = html.xpath('//h3[@class="title" and not(contains(normalize-space(.), "目录")) and not(contains(normalize-space(.), "为你推荐"))]/text()') + title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number + actor = get_some_info(title, file_path, info_type="actor") + tag = get_some_info(title, file_path, info_type="tag") + cover_url = html.xpath(f'//a[@data-original and contains(@href,"{href}")]/@data-original') + cover_url = cover_url[0] if cover_url else '' + + return number, title, actor, cover_url, tag + + +def get_real_url(html, number_list, hscangku_url): + item_list = html.xpath('//a[@class="stui-vodlist__thumb lazyload"]') + for each in item_list: + #href="/vodplay/41998-1-1.html" + detail_url = hscangku_url + each.get('href') + title = each.xpath('@title')[0] + if title and detail_url: + for n in number_list: + temp_n = re.sub(r'[\W_]', '', n).upper() + temp_title = re.sub(r'[\W_]', '', title).upper() + if temp_n in temp_title: + return True, n, title, detail_url + return False, '', '', '' + +def get_redirected_url(url): + + result, response = curl_html(url) + if not result: + return None + + if redirected_url 
:= re.search(r'"(https?://.*?)"', response).group(1): + http = urllib3.PoolManager() + response = http.request('GET', f'{redirected_url}{url}&p=', redirect=False) + final_url = response.get_redirect_location() + return final_url if final_url else None + else: + return None + +def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''): + start_time = time.time() + website_name = 'hscangku' + req_web += '-> %s' % website_name + title = '' + cover_url = '' + web_info = '\n ' + log_info += ' \n 🌐 hscangku' + debug_info = '' + real_url = appoint_url + hscangku_url = getattr(config, 'hscangku_website', 'http://hsck.net') + + try: + if not real_url: + # 处理番号 + number_list, filename_list = get_number_list(number, appoint_number, file_path) + n_list = number_list[:1] + filename_list + #处理重定向 + hscangku_url = get_redirected_url(hscangku_url) + if not hscangku_url: + debug_info = '没有正确的 hscangku_url,无法刮削' + log_info += web_info + debug_info + raise Exception(debug_info) + for each in n_list: + real_url = f'{hscangku_url}/vodsearch/-------------.html?wd={each}&submit=' + # real_url = 'http://hsck860.cc/vodsearch/-------------.html?wd=%E6%9F%9A%E5%AD%90%E7%8C%AB&submit=' + debug_info = f'请求地址: {real_url} ' + log_info += web_info + debug_info + result, response = curl_html(real_url) + + if not result: + debug_info = '网络请求错误: %s' % response + log_info += web_info + debug_info + raise Exception(debug_info) + search_page = etree.fromstring(response, etree.HTMLParser()) + result, number, title, real_url = get_real_url(search_page, n_list, hscangku_url) + # real_url = 'http://hsck860.cc/vodsearch/-------------.html?wd=%E6%9F%9A%E5%AD%90%E7%8C%AB&submit=' + if result: + break + else: + debug_info = '没有匹配的搜索结果' + log_info += web_info + debug_info + raise Exception(debug_info) + + debug_info = f'番号地址: {real_url} ' + log_info += web_info + debug_info + result, response = curl_html(real_url) + + if not result: + debug_info = '没有找到数据 %s ' % response + log_info += web_info + debug_info + raise Exception(debug_info) + + detail_page = etree.fromstring(response, etree.HTMLParser()) + number, title, actor, cover_url, tag = get_detail_info(detail_page, real_url, number, file_path) + actor_photo = get_actor_photo(actor) + + try: + dic = { + 'number': number, + 'title': title, + 'originaltitle': title, + 'actor': actor, + 'outline': '', + 'originalplot': '', + 'tag': tag, + 'release': '', + 'year': '', + 'runtime': '', + 'score': '', + 'series': '', + 'country': 'CN', + 'director': '', + 'studio': '', + 'publisher': '', + 'source': 'hscangku', + 'website': real_url, + 'actor_photo': actor_photo, + 'cover': cover_url, + 'poster': '', + 'extrafanart': '', + 'trailer': '', + 'image_download': False, + 'image_cut': 'no', + 'log_info': log_info, + 'error_info': '', + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + 'mosaic': '国产', + 'wanted': '', + } + debug_info = '数据获取成功!' 
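# Illustrative sketch (editorial aside, not part of the committed patch):
# get_redirected_url() above calls re.search(...).group(1) unconditionally,
# which raises AttributeError when the response contains no quoted URL.
# A defensive variant using the same curl_html/urllib3 helpers might look like:
#
#     def get_redirected_url_safe(url):
#         result, response = curl_html(url)
#         if not result:
#             return None
#         m = re.search(r'"(https?://.*?)"', response)  # first quoted URL, if any
#         if not m:
#             return None
#         http = urllib3.PoolManager()
#         resp = http.request('GET', f'{m.group(1)}{url}&p=', redirect=False)
#         # get_redirect_location() returns the Location header, or False/None
#         return resp.get_redirect_location() or None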
+ log_info += web_info + debug_info + dic['log_info'] = log_info + except Exception as e: + debug_info = '数据生成出错: %s' % str(e) + log_info += web_info + debug_info + raise Exception(debug_info) + + except Exception as e: + # print(traceback.format_exc()) + debug_info = str(e) + dic = { + 'title': '', + 'cover': '', + 'website': '', + 'log_info': log_info, + 'error_info': debug_info, + 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )), + } + dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}} + js = json.dumps( + dic, + ensure_ascii=False, + sort_keys=False, + indent=4, + separators=(',', ': '), + ) + return js + + +if __name__ == '__main__': + # yapf: disable + # print(main('GDCM-018')) + # print(main('国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度', file_path='国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度')) + # print(main('RS001', file_path='RS-001.红斯灯影像.REDSTEN.淫白大胜利.上.男女水中竞赛.败方被强制插入高潮连连')) + # print(main('MD-0269', file_path='MD-0269.梁佳芯.唐芯.换妻性爱淫元宵.正月十五操骚鲍.麻豆传媒映画原创中文原版收藏')) + # print(main('sh-006', file_path='SH-006.谢冰岚.神屌侠侣.是谁操了我的小龙女.涩会传媒')) + # print(main('PMC-085', file_path='PMC/PMC-085.雪霏.出差借宿小姨子乱伦姐夫.特别照顾的肉体答谢.蜜桃影像传媒.ts')) + # print(main('TM-0165', file_path='TM0165.王小妮.妈妈的性奴之路.性感少妇被儿子和同学调教成性奴.天美传媒')) + # print(main('mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒')) + # print(main('mini06', file_path='mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒')) + # print(main('mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒', file_path='mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒')) + # print(main('XSJ138', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品')) + # print(main('DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌', file_path='DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌')) + # print(main('MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作', file_path='MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作')) + # print(main('MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列', file_path='MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列')) + # print(main('XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品')) + print(main('大像传媒之淫蕩刺青女學徒', file_path='大像传媒之淫蕩刺青女學徒')) + # print(main('MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样', file_path='MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样')) + # print(main('MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版', file_path='MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版')) + # print(main('MSD-023', file_path='MSD023.袁子仪.杨柳.可爱女孩非亲妹.渴望已久的(非)近亲性爱.麻豆传媒映画.Model.Seeding系列.mp4')) + # print(main('', file_path='夏日回忆 贰')) + # print(main('MDX-0016')) + # print(main('MDSJ-0004')) + # print(main('RS-020')) + # print(main('PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒', file_path='PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒')) + # print(main('老公在外出差家里的娇妻被入室小偷强迫性交 - 美酱')) + # print(main('', file_path='夏日回忆 贰 HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4')) + # print(main('', file_path='HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4')) + # print(main('', file_path="【HongKongDoll玩偶姐姐.短篇集.情人节特辑.Valentine's Day Special-cd2")) + # print(main('', file_path='PMC-062 唐茜.綠帽丈夫連同新弟怒操出軌老婆.強拍淫蕩老婆被操 唐茜.ts')) + # print(main('', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画')) + # print(main('淫欲游戏王.EP6', appoint_number='淫欲游戏王.EP5', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')) # EP不带.才能搜到 + # print(main('', file_path='PMS-003.职场冰与火.EP3设局.宁静.苏文文.设局我要女人都臣服在我胯下.蜜桃影像传媒')) + # print(main('', file_path='PMS-001 性爱公寓EP04 仨人.蜜桃影像传媒.ts')) + # print(main('', file_path='PMS-001.性爱公寓EP03.ts')) + # print(main('', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.ts')) + # print(main('', 
file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')) + # main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts') + # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱')) # 简体搜不到 + # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木结爱.TS')) + # '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛', '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-', ' 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛'] + # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱.TS')) + # print(main('', file_path='PMS-001 性爱公寓EP02 女王 蜜桃影像传媒 -莉娜乔安.TS')) + # print(main('91CM-081', file_path='91CM-081.田恬.李琼.继母与女儿.三.爸爸不在家先上妹妹再玩弄母亲.果冻传媒.mp4')) + # print(main('91CM-081', file_path='MDJ-0001.EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.麻豆传媒映画.mp4')) + # print(main('91CM-081', file_path='MDJ0001 EP2 AV 淫兽鬼父 陈美惠 .TS')) + # print(main('91CM-081', file_path='MXJ-0005.EP1.弥生美月.小恶魔高校生.与老师共度的放浪补课.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='PH-US-002.色控.音乐老师全裸诱惑.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.TS')) + # print(main('91CM-081', file_path='MD-0140-2.蜜苏.家有性事EP2.爱在身边.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDUS系列[中文字幕].LAX0025.性感尤物渴望激情猛操.RUCK ME LIKE A SEX DOLL.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='REAL野性派001-朋友的女友讓我最上火.TS')) + # print(main('91CM-081', file_path='MDS-009.张芸熙.巨乳旗袍诱惑.搔首弄姿色气满点.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDS005 被雇主强上的熟女家政妇 大声呻吟被操到高潮 杜冰若.mp4.TS')) + # print(main('91CM-081', file_path='TT-005.孟若羽.F罩杯性感巨乳DJ.麻豆出品x宫美娱乐.TS')) + # print(main('91CM-081', file_path='台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品.TS')) + # print(main('91CM-081', file_path='PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='鲍鱼游戏SquirtGame.吸舔碰糖.失败者屈辱凌辱.TS')) + # print(main('91CM-081', file_path='导演系列 外卖员的色情体验 麻豆传媒映画.TS')) + # print(main('91CM-081', file_path='MDS007 骚逼女友在作妖-硬上男友当玩具 叶一涵.TS')) + # print(main('MDM-002')) # 去掉标题最后的发行商 + # print(main('MDS-007')) # 数字要四位才能搜索到,即 MDS-0007 MDJ001 EP1 我的女优物语陈美惠.TS + # print(main('MDS-007', file_path='MDJ001 EP1 我的女优物语陈美惠.TS')) # 数字要四位才能搜索到,即 MDJ-0001.EP1 + # print(main('91CM-090')) # 带横线才能搜到 + # print(main('台湾SWAG chloebabe 剩蛋特辑 干爆小鹿')) # 带空格才能搜到 + # print(main('淫欲游戏王EP2')) # 不带空格才能搜到 + # print(main('台湾SWAG-chloebabe-剩蛋特輯-幹爆小鹿')) + # print(main('MD-0020')) + # print(main('mds009')) + # print(main('mds02209')) + # print(main('女王的SM调教')) + # print(main('91CM202')) + # print(main('91CM-202')) From ba58501b4b2f70285bdd8fe6444d42e4100272d8 Mon Sep 17 00:00:00 2001 From: runoob Date: Mon, 5 Feb 2024 10:27:50 +0800 Subject: [PATCH 12/12] Fix: PR compliance revisions --- src/models/core/nfo.py | 4 +-- src/models/crawlers/cableav.py | 51 +++++-------------------------- src/models/crawlers/guochan.py | 37 +++++++++++++++++++++++ src/models/crawlers/hscangku.py | 53 +++++---------------------------- src/models/crawlers/madouqu.py | 41 ++----------------------- src/models/crawlers/mdtv.py | 2 +- 6 files changed, 57 insertions(+), 131 deletions(-) diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py index 2143a86..31c8bcc 100644 --- a/src/models/core/nfo.py +++ b/src/models/core/nfo.py @@ -81,8 +81,8 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal nfo_title = config.naming_media if not number: number = title - #默认emby视频标题配置为 [number title],国产重复时需去掉一个,去重需注意空格也应一起去掉,否则国产的nfo标题中会多一个空格 - #读取nfo title信息会去掉前面的number和空格以保留title展示出来,同时number和标题一致时,去掉number的逻辑变成去掉整个标题导致读取失败,见424行 + # 默认emby视频标题配置为 [number 
title],国产重复时需去掉一个,去重需注意空格也应一起去掉,否则国产的nfo标题中会多一个空格 + # 读取nfo title信息会去掉前面的number和空格以保留title展示出来,同时number和标题一致时,去掉number的逻辑变成去掉整个标题导致读取失败,见426行 if number == title and 'number' in nfo_title and 'title' in nfo_title: nfo_title = nfo_title.replace('originaltitle', '').replace('title', '').strip() first_letter = get_number_first_letter(number) diff --git a/src/models/crawlers/cableav.py b/src/models/crawlers/cableav.py index 52b8164..f486939 100644 --- a/src/models/crawlers/cableav.py +++ b/src/models/crawlers/cableav.py @@ -11,49 +11,12 @@ from models.base.web import curl_html from models.config.config import config from models.crawlers.guochan import get_number_list -from models.crawlers.guochan import get_actor_list, get_lable_list +from models.crawlers.guochan import get_actor_list, get_lable_list,get_extra_info urllib3.disable_warnings() # yapf: disable # import traceback -def get_some_info(title, file_path, info_type, tag='', actor='', series=''): - - all_info = title + file_path + tag + actor + series - - # 未找到标签时,从各种信息里匹配 - if info_type == "tag": - tag_list = [] - all_tag = get_lable_list() - for each in all_tag: - if each in all_info.upper(): - tag_list.append(each) - new_tag_list = [] - [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list] - return ','.join(new_tag_list) - - # 未找到演员时,看热门演员是否在标题和各种信息里 - if info_type == "actor": - actor_list = [] - all_actor = get_actor_list() - for each in all_actor: - if each in all_info.upper(): - actor_list.append(each) - new_actor_list = [] - [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list] - return ','.join(new_actor_list) - - # 未找到系列时,从各种信息里匹配 - if info_type == "series": - series_list = [] - all_series = get_lable_list() - for each in all_series: - if each in all_info.upper(): - series_list.append(each) - new_series_list = [] - [new_series_list.append(i) for i in series_list if i and i not in new_series_list] - return ','.join(new_series_list) - def get_actor_photo(actor): actor = actor.split(',') data = {} @@ -66,7 +29,7 @@ def get_actor_photo(actor): def get_detail_info(html, number, file_path): title_h1 = html.xpath('//div[@class="entry-content "]/p/text()') title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number - actor = get_some_info(title, file_path, info_type="actor") + actor = get_extra_info(title, file_path, info_type="actor") tmp_tag = html.xpath('//header//div[@class="categories-wrap"]/a/text()') # 标签转简体 tag = zhconv.convert(tmp_tag[0], 'zh-cn') if tmp_tag else '' @@ -221,11 +184,11 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file # print(main('MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作', file_path='MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作')) # print(main('MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列', file_path='MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列')) # print(main('XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品')) - print(main('真实记录反差', file_path='真实记录反差')) - # print(main('MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样', file_path='MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样')) - # print(main('MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版', file_path='MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版')) - # print(main('MSD-023', file_path='MSD023.袁子仪.杨柳.可爱女孩非亲妹.渴望已久的(非)近亲性爱.麻豆传媒映画.Model.Seeding系列.mp4')) - # print(main('', file_path='夏日回忆 贰')) + # print(main('SSN010')) + # print(main('國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露', file_path='國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露')) + # 
print(main('國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜', file_path='國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜'))
+    # print(main('韓國高端攝影頂 Yeha 私拍福利', file_path='韓國高端攝影頂 Yeha 私拍福利'))
+    print(main('EMTC-005', file_path='國產AV 愛神傳媒 EMTC005 怒操高冷社長秘書 米歐'))
     # print(main('MDX-0016'))
     # print(main('MDSJ-0004'))
     # print(main('RS-020'))
diff --git a/src/models/crawlers/guochan.py b/src/models/crawlers/guochan.py
index 6dd1d38..a8bec9b 100644
--- a/src/models/crawlers/guochan.py
+++ b/src/models/crawlers/guochan.py
@@ -278,6 +278,43 @@ def get_number_list(number, appoint_number='', file_path=''):  # 处理国产番
     [new_filename_list.append(i) for i in filename_list if i and i not in new_filename_list]
     return new_number_list, new_filename_list
 
+def get_extra_info(title, file_path, info_type, tag='', actor='', series=''):
+
+    all_info = title + file_path + tag + actor + series
+
+    # 未找到标签时,从各种信息里匹配,忽略大小写
+    if info_type == "tag":
+        tag_list = []
+        all_tag = get_lable_list()
+        for each in all_tag:
+            if re.search(f'{each}', all_info, re.IGNORECASE):
+                tag_list.append(each)
+        new_tag_list = []
+        [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]
+        return ','.join(new_tag_list)
+
+    # 未找到演员时,看热门演员是否在标题和各种信息里,人名完全匹配
+    if info_type == "actor":
+        actor_list = []
+        all_actor = get_actor_list()
+        for each in all_actor:
+            if re.search(fr'\b{each}\b', all_info, re.IGNORECASE):
+                actor_list.append(each)
+        new_actor_list = []
+        [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
+        return ','.join(new_actor_list)
+
+    # 未找到系列时,从各种信息里匹配,没有相关数据,预留逻辑
+    if info_type == "series":
+        series_list = []
+        all_series = get_lable_list()
+        for each in all_series:
+            if each in all_info.upper():
+                series_list.append(each)
+        new_series_list = []
+        [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
+        return ','.join(new_series_list)
+
 
 if __name__ == '__main__':
     # yapf: disable
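The core of this revision is the shared get_extra_info() helper added above: tag matching becomes case-insensitive, and actor matching now requires the whole name rather than any substring hit. A minimal standalone sketch of the word-boundary behaviour, assuming hypothetical sample names, title, and file path (the real inputs come from get_actor_list() and the scraped metadata):

    import re

    hot_actors = ['Yeha', 'Roxie', '米歐']   # hypothetical stand-in for get_actor_list()
    title = '大像传媒之長腿癡女代表情慾作-米歐'
    file_path = '大像传媒之長腿癡女代表情慾作-米歐.mp4'
    all_info = title + file_path

    matched = []
    for name in hot_actors:
        # Whole-name, case-insensitive search as in the helper: \b anchors on
        # the \w/non-\w transition, so '米歐' matches because it sits between
        # '-' and '.', while a bare substring test could also fire inside
        # longer, unrelated tokens.
        if re.search(fr'\b{name}\b', all_info, re.IGNORECASE):
            matched.append(name)

    # Order-preserving de-duplication, mirroring the helper's final step.
    deduped = []
    [deduped.append(i) for i in matched if i and i not in deduped]
    print(','.join(deduped))  # -> 米歐

Note that CJK characters also count as \w, so the boundary only falls where a name abuts punctuation, whitespace, or an end of the string; a name concatenated directly against other Han characters will not match at that position.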
diff --git a/src/models/crawlers/hscangku.py b/src/models/crawlers/hscangku.py
index 20592b9..9b84373 100644
--- a/src/models/crawlers/hscangku.py
+++ b/src/models/crawlers/hscangku.py
@@ -11,49 +11,12 @@
 from models.base.web import curl_html
 from models.config.config import config
 from models.crawlers.guochan import get_number_list
-from models.crawlers.guochan import get_actor_list, get_lable_list
+from models.crawlers.guochan import get_actor_list, get_lable_list,get_extra_info
 
 urllib3.disable_warnings()  # yapf: disable
 
 # import traceback
 
-def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
-
-    all_info = title + file_path + tag + actor + series
-
-    # 未找到标签时,从各种信息里匹配
-    if info_type == "tag":
-        tag_list = []
-        all_tag = get_lable_list()
-        for each in all_tag:
-            if each in all_info.upper():
-                tag_list.append(each)
-        new_tag_list = []
-        [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]
-        return ','.join(new_tag_list)
-
-    # 未找到演员时,看热门演员是否在标题和各种信息里
-    if info_type == "actor":
-        actor_list = []
-        all_actor = get_actor_list()
-        for each in all_actor:
-            if each in all_info.upper():
-                actor_list.append(each)
-        new_actor_list = []
-        [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
-        return ','.join(new_actor_list)
-
-    # 未找到系列时,从各种信息里匹配
-    if info_type == "series":
-        series_list = []
-        all_series = get_lable_list()
-        for each in all_series:
-            if each in all_info.upper():
-                series_list.append(each)
-        new_series_list = []
-        [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
-        return ','.join(new_series_list)
-
 def get_actor_photo(actor):
     actor = actor.split(',')
     data = {}
@@ -67,8 +30,8 @@ def get_detail_info(html, real_url, number, file_path):
     href = re.split(r'[/.]', real_url)[-2]
     title_h1 = html.xpath('//h3[@class="title" and not(contains(normalize-space(.), "目录")) and not(contains(normalize-space(.), "为你推荐"))]/text()')
     title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number
-    actor = get_some_info(title, file_path, info_type="actor")
-    tag = get_some_info(title, file_path, info_type="tag")
+    actor = get_extra_info(title, file_path, info_type="actor")
+    tag = get_extra_info(title, file_path, info_type="tag")
     cover_url = html.xpath(f'//a[@data-original and contains(@href,"{href}")]/@data-original')
     cover_url = cover_url[0] if cover_url else ''
@@ -240,11 +203,11 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file
     # print(main('MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作', file_path='MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作'))
     # print(main('MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列', file_path='MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列'))
     # print(main('XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品'))
-    print(main('大像传媒之淫蕩刺青女學徒', file_path='大像传媒之淫蕩刺青女學徒'))
-    # print(main('MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样', file_path='MAN麻豆女性向系列.MAN-0011.岚湘庭.当男人恋爱时.我可以带你去流浪.也知道下场不怎么样'))
-    # print(main('MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版', file_path='MDL-0009-2.楚梦舒.苏语棠.致八零年代的我们.年少的性欲和冲动.麻豆传媒映画原创中文收藏版'))
-    # print(main('MSD-023', file_path='MSD023.袁子仪.杨柳.可爱女孩非亲妹.渴望已久的(非)近亲性爱.麻豆传媒映画.Model.Seeding系列.mp4'))
-    # print(main('', file_path='夏日回忆 贰'))
+    # print(main('大像传媒之淫蕩刺青女學徒', file_path='大像传媒之淫蕩刺青女學徒'))
+    # print(main('冠希传媒GX-017强上弟弟的巨乳姐姐', file_path='冠希传媒GX-017强上弟弟的巨乳姐姐'))
+    # print(main('[SWAG]XHX-0014宅男的公仔幻化成人', file_path='[SWAG]XHX-0014宅男的公仔幻化成人'))
+    # print(main('IDG5401'))
+    print(main('大像传媒之長腿癡女代表情慾作-米歐', file_path='大像传媒之長腿癡女代表情慾作-米歐'))
     # print(main('MDX-0016'))
     # print(main('MDSJ-0004'))
     # print(main('RS-020'))
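With the duplicated get_some_info() copies deleted, cableav.py, hscangku.py, and madouqu.py all defer to the shared helper when their own page parsing comes up empty. A condensed sketch of the call pattern hscangku.py now uses in get_detail_info(), assuming hypothetical title and file_path values (the real ones come from the scraped page and the local file):

    from models.crawlers.guochan import get_extra_info

    # Hypothetical samples; normally scraped from the page / local file.
    title = '冠希传媒GX-017强上弟弟的巨乳姐姐'
    file_path = '冠希传媒GX-017强上弟弟的巨乳姐姐.mp4'

    # The site exposes no structured actor or tag fields, so both are
    # recovered by matching known names and labels inside title + path.
    actor = get_extra_info(title, file_path, info_type='actor')
    tag = get_extra_info(title, file_path, info_type='tag')

madouqu.py, in the next hunk, differs only in that it keeps an actor parsed from the page when one exists and falls back to the helper otherwise.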
diff --git a/src/models/crawlers/madouqu.py b/src/models/crawlers/madouqu.py
index 6ee0863..57e9ad2 100644
--- a/src/models/crawlers/madouqu.py
+++ b/src/models/crawlers/madouqu.py
@@ -10,50 +10,13 @@
 from models.base.web import curl_html
 from models.crawlers.guochan import get_number_list
 from models.config.config import config
-from models.crawlers.guochan import get_actor_list, get_lable_list
+from models.crawlers.guochan import get_actor_list, get_lable_list,get_extra_info
 
 urllib3.disable_warnings()  # yapf: disable
 
 # import traceback
 
-def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
-
-    all_info = title + file_path + tag + actor + series
-
-    # 未找到标签时,从各种信息里匹配
-    if info_type == "tag":
-        tag_list = []
-        all_tag = get_lable_list()
-        for each in all_tag:
-            if each in all_info.upper():
-                tag_list.append(each)
-        new_tag_list = []
-        [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]
-        return ','.join(new_tag_list)
-
-    # 未找到演员时,看热门演员是否在标题和各种信息里
-    if info_type == "actor":
-        actor_list = []
-        all_actor = get_actor_list()
-        for each in all_actor:
-            if each in all_info.upper():
-                actor_list.append(each)
-        new_actor_list = []
-        [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
-        return ','.join(new_actor_list)
-
-    # 未找到系列时,从各种信息里匹配
-    if info_type == "series":
-        series_list = []
-        all_series = get_lable_list()
-        for each in all_series:
-            if each in all_info.upper():
-                series_list.append(each)
-        new_series_list = []
-        [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
-        return ','.join(new_series_list)
-
 def get_actor_photo(actor):
     actor = actor.split(',')
     data = {}
@@ -86,7 +49,7 @@ def get_detail_info(html, number, file_path):
     cover_url = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p/img/@src')
     cover_url = cover_url[0] if cover_url else ''
     # print(number, title, actor, cover_url, studio, detail_info)
-    actor = get_some_info(title, file_path, info_type="actor") if actor == '' else actor
+    actor = get_extra_info(title, file_path, info_type="actor") if actor == '' else actor
     return number, title, actor, cover_url, studio
 
diff --git a/src/models/crawlers/mdtv.py b/src/models/crawlers/mdtv.py
index d7b3647..511b861 100644
--- a/src/models/crawlers/mdtv.py
+++ b/src/models/crawlers/mdtv.py
@@ -40,7 +40,7 @@ def get_some_info(html, title, file_path):  # 未找到演员时,看热门演员是否在标题和各种信息里
     series = series_list[0] if series_list else ''
     tag = ','.join(tag_list)
-    actor_fake_name = any ('未知' in item for item in actor_list)
+    actor_fake_name = any('未知' in item for item in actor_list)
     actor_list = [] if actor_fake_name else actor_list
     if not actor_list:
         all_info = title + series + tag + file_path
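mdtv.py keeps its own site-specific get_some_info(), since it can read actors straight off the page; the hunk above only normalizes the spacing of the any() call that screens out placeholder names. Roughly, with a hypothetical scraped list for illustration:

    # Entries containing '未知' ('unknown') are site placeholders,
    # not real performers, so the whole list is discarded.
    actor_list = ['未知演员']
    actor_fake_name = any('未知' in item for item in actor_list)
    actor_list = [] if actor_fake_name else actor_list
    # An emptied list then routes into the title/series/tag/file_path
    # matching fallback that follows in the original function.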