diff --git a/src/controllers/main_window/main_window.py b/src/controllers/main_window/main_window.py
index b471cf3..3b3676b 100644
--- a/src/controllers/main_window/main_window.py
+++ b/src/controllers/main_window/main_window.py
@@ -2061,6 +2061,8 @@ def _netResult(self):
'mdtv': ['https://www.mdpjzip.xyz', ''],
'madouqu': ['https://madouqu.com', ''],
'cnmdb': ['https://cnmdb.net', ''],
+ 'hscangku': ['https://hscangku.net', ''],
+ 'cableav': ['https://cableav.tv', ''],
'lulubar': ['https://lulubar.co', ''],
'love6': ['https://love6.tv', ''],
'yesjav': ['http://www.yesjav.info', ''],
diff --git a/src/models/config/config_manual.py b/src/models/config/config_manual.py
index e97a984..5c59e21 100644
--- a/src/models/config/config_manual.py
+++ b/src/models/config/config_manual.py
@@ -67,6 +67,8 @@ class ManualConfig:
'lulubar',
'madouqu',
'mdtv',
+ 'hscangku',
+ 'cableav',
'mgstage',
'mywife',
'prestige',
@@ -513,6 +515,8 @@ class ManualConfig:
'mdtv': 'mdtv',
'mdpjzip': 'mdtv',
'madouqu': 'madouqu',
+ 'hsck': 'hscangku',
+ 'cableav': 'cableav',
'mgstage': 'mgstage',
'7mmtv': '7mmtv',
'bb9711': '7mmtv',
diff --git a/src/models/core/crawler.py b/src/models/core/crawler.py
index 4e3d078..aebeba1 100644
--- a/src/models/core/crawler.py
+++ b/src/models/core/crawler.py
@@ -11,7 +11,7 @@
from models.core.flags import Flags
from models.crawlers import airav_cc_new, airav_new, avsex, avsox, cnmdb, dahlia, dmm, faleno, fantastica, fc2, fc2club, \
fc2hub, freejavbt, getchu, getchu_dmm, giga, hdouban, iqqtv_new, jav321, javbus, javdb, javlibrary_new, kin8, love6, \
- lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity
+ lulubar, madouqu, mdtv, mgstage, mmtv, mywife, official, prestige, theporndb, xcity, hscangku, cableav
from models.entity.enums import FileMode
@@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
elif website == 'mgstage':
json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number))
elif website == '7mmtv':
- json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language))
+ json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path))
elif website == 'fc2':
json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language))
elif website == 'fc2hub':
@@ -137,6 +137,12 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
elif website == 'madouqu':
json_data = json.loads(
madouqu.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
+ elif website == 'hscangku':
+ json_data = json.loads(
+ hscangku.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
+ elif website == 'cableav':
+ json_data = json.loads(
+ cableav.main(file_number, appoint_url, log_info, req_web, language, file_path, appoint_number))
elif website == 'getchu':
json_data = json.loads(getchu.main(file_number, appoint_url, log_info, req_web, language))
elif website == 'getchu_dmm':
diff --git a/src/models/core/nfo.py b/src/models/core/nfo.py
index a0467e0..31c8bcc 100644
--- a/src/models/core/nfo.py
+++ b/src/models/core/nfo.py
@@ -81,8 +81,10 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
nfo_title = config.naming_media
if not number:
number = title
+ # 默认emby视频标题配置为 [number title],国产重复时需去掉一个,去重需注意空格也应一起去掉,否则国产的nfo标题中会多一个空格
+ # 读取nfo title信息会去掉前面的number和空格以保留title展示出来,同时number和标题一致时,去掉number的逻辑变成去掉整个标题导致读取失败,见426行
if number == title and 'number' in nfo_title and 'title' in nfo_title:
- nfo_title = nfo_title.replace('originaltitle', '').replace('title', '')
+ nfo_title = nfo_title.replace('originaltitle', '').replace('title', '').strip()
first_letter = get_number_first_letter(number)
# 处理演员
@@ -204,20 +206,29 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
# 输出国家
if 'country,' in nfo_include_new:
print(f" {country}", file=code)
-
- # 输出演员
+
+    # 初始化 actor_list
+ actor_list = []
+ # 输出男女演员
if 'actor_all,' in nfo_include_new:
actor = all_actor
- if actor and actor != '未知演员' and actor != '未知演員' and 'actor,' in nfo_include_new:
+ # 有演员时输出演员
+ if 'actor,' in nfo_include_new and actor:
actor_list = actor.split(',') # 字符串转列表
actor_list = [actor.strip() for actor in actor_list if actor.strip()] # 去除空白
- if actor_list:
- for each in actor_list:
- print(" ", file=code)
- print(" " + each + "", file=code)
- print(" Actor", file=code)
- print(" ", file=code)
-
+ # 无演员时输出演员 以文件命名设置中未知演员设置项为演员名,默认设置和空值不写入NFO
+ elif 'actor,' in nfo_include_new and config.actor_no_name not in ["未知演员",'未知演員','']:
+ actor = config.actor_no_name
+ actor_list = actor.split(',') # 字符串转列表
+ actor_list = [actor.strip() for actor in actor_list if actor.strip()] # 去除空白
+ signal.add_log(f'⛑️ 无演员名, 使用手动命名 写入NFO {config.actor_no_name}')
+ if actor_list:
+ for each in actor_list:
+ print(" ", file=code)
+ print(" " + each + "", file=code)
+ print(" Actor", file=code)
+ print(" ", file=code)
+
# 输出导演
if director and 'director,' in nfo_include_new:
print(" " + director + "", file=code)
@@ -318,10 +329,12 @@ def write_nfo(json_data, nfo_new_path, folder_new_path, file_path, edit_mode=Fal
print(" " + website + "", file=code)
# javdb id 输出, 没有时使用番号搜索页
- if 'javdbid' in json_data_nfo and json_data_nfo['javdbid']:
- print(" " + json_data_nfo["javdbid"] + "", file=code)
- else:
- print(" " + number + "", file=code)
+ if 'javdbid' in json_data_nfo:
+ # 其他非javdb网站取消强制输出该字段
+ if json_data_nfo['javdbid']:
+ print(" " + json_data_nfo["javdbid"] + "", file=code)
+ else:
+ print(" " + number + "", file=code)
print("", file=code)
json_data['logs'] += "\n 🍀 Nfo done! (new)(%ss)" % get_used_time(start_time)
return True
diff --git a/src/models/crawlers/cableav.py b/src/models/crawlers/cableav.py
new file mode 100644
index 0000000..f486939
--- /dev/null
+++ b/src/models/crawlers/cableav.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import json
+import re
+import time
+
+import urllib3
+import zhconv
+from lxml import etree
+
+from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_number_list
+from models.crawlers.guochan import get_actor_list, get_lable_list,get_extra_info
+
+urllib3.disable_warnings() # yapf: disable
+
+# import traceback
+
+def get_actor_photo(actor):
+ actor = actor.split(',')
+ data = {}
+ for i in actor:
+ actor_photo = {i: ''}
+ data.update(actor_photo)
+ return data
+
+
+def get_detail_info(html, number, file_path):
+ title_h1 = html.xpath('//div[@class="entry-content "]/p/text()')
+ title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number
+ actor = get_extra_info(title, file_path, info_type="actor")
+ tmp_tag = html.xpath('//header//div[@class="categories-wrap"]/a/text()')
+ # 标签转简体
+ tag = zhconv.convert(tmp_tag[0], 'zh-cn') if tmp_tag else ''
+ cover_url = html.xpath(f'//meta[@property="og:image"]/@content')
+ cover_url = cover_url[0] if cover_url else ''
+
+ return number, title, actor, cover_url, tag
+
+
+def get_real_url(html, number_list):
+ item_list = html.xpath('//h3[contains(@class,"title")]//a[@href and @title]')
+ for each in item_list:
+        # href="https://cableav.tv/Xq1Sg3SvZPk/"
+ detail_url = each.get('href')
+ title = each.xpath('text()')[0]
+ if title and detail_url:
+ for n in number_list:
+ temp_n = re.sub(r'[\W_]', '', n).upper()
+ temp_title = re.sub(r'[\W_]', '', title).upper()
+ if temp_n in temp_title:
+ return True, n, title, detail_url
+ return False, '', '', ''
+
+
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''):
+ start_time = time.time()
+ website_name = 'cableav'
+ req_web += '-> %s' % website_name
+ title = ''
+ cover_url = ''
+ web_info = '\n '
+ log_info += ' \n 🌐 cableav'
+ debug_info = ''
+ real_url = appoint_url
+ cableav_url = getattr(config, 'cableav_website', 'https://cableav.tv')
+
+ try:
+ if not real_url:
+ # 处理番号
+ number_list, filename_list = get_number_list(number, appoint_number, file_path)
+ n_list = number_list[:1] + filename_list
+ for each in n_list:
+ real_url = f'{cableav_url}/?s={each}'
+ # real_url = 'https://cableav.tv/s?s=%E6%9F%9A%E5%AD%90%E7%8C%AB'
+ debug_info = f'请求地址: {real_url} '
+ log_info += web_info + debug_info
+ result, response = curl_html(real_url)
+ if not result:
+ debug_info = '网络请求错误: %s' % response
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+ search_page = etree.fromstring(response, etree.HTMLParser())
+ result, number, title, real_url = get_real_url(search_page, n_list)
+ # real_url = 'https://cableav.tv/hyfaqwfjhio'
+ if result:
+ break
+ else:
+ debug_info = '没有匹配的搜索结果'
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+
+ debug_info = f'番号地址: {real_url} '
+ log_info += web_info + debug_info
+ result, response = curl_html(real_url)
+
+ if not result:
+ debug_info = '没有找到数据 %s ' % response
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+
+ detail_page = etree.fromstring(response, etree.HTMLParser())
+ number, title, actor, cover_url, tag = get_detail_info(detail_page, number, file_path)
+ actor_photo = get_actor_photo(actor)
+
+ try:
+ dic = {
+ 'number': number,
+ 'title': title,
+ 'originaltitle': title,
+ 'actor': actor,
+ 'outline': '',
+ 'originalplot': '',
+ 'tag': tag,
+ 'release': '',
+ 'year': '',
+ 'runtime': '',
+ 'score': '',
+ 'series': '',
+ 'country': 'CN',
+ 'director': '',
+ 'studio': '',
+ 'publisher': '',
+ 'source': 'cableav',
+ 'website': real_url,
+ 'actor_photo': actor_photo,
+ 'cover': cover_url,
+ 'poster': '',
+ 'extrafanart': '',
+ 'trailer': '',
+ 'image_download': False,
+ 'image_cut': 'no',
+ 'log_info': log_info,
+ 'error_info': '',
+ 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )),
+ 'mosaic': '国产',
+ 'wanted': '',
+ }
+ debug_info = '数据获取成功!'
+ log_info += web_info + debug_info
+ dic['log_info'] = log_info
+ except Exception as e:
+ debug_info = '数据生成出错: %s' % str(e)
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+
+ except Exception as e:
+ # print(traceback.format_exc())
+ debug_info = str(e)
+ dic = {
+ 'title': '',
+ 'cover': '',
+ 'website': '',
+ 'log_info': log_info,
+ 'error_info': debug_info,
+ 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )),
+ }
+ dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}}
+ js = json.dumps(
+ dic,
+ ensure_ascii=False,
+ sort_keys=False,
+ indent=4,
+ separators=(',', ': '),
+ )
+ return js
+
+
+if __name__ == '__main__':
+ # yapf: disable
+ # print(main('GDCM-018'))
+ # print(main('国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度', file_path='国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度'))
+ # print(main('RS001', file_path='RS-001.红斯灯影像.REDSTEN.淫白大胜利.上.男女水中竞赛.败方被强制插入高潮连连'))
+ # print(main('MD-0269', file_path='MD-0269.梁佳芯.唐芯.换妻性爱淫元宵.正月十五操骚鲍.麻豆传媒映画原创中文原版收藏'))
+ # print(main('sh-006', file_path='SH-006.谢冰岚.神屌侠侣.是谁操了我的小龙女.涩会传媒'))
+ # print(main('PMC-085', file_path='PMC/PMC-085.雪霏.出差借宿小姨子乱伦姐夫.特别照顾的肉体答谢.蜜桃影像传媒.ts'))
+ # print(main('TM-0165', file_path='TM0165.王小妮.妈妈的性奴之路.性感少妇被儿子和同学调教成性奴.天美传媒'))
+ # print(main('mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒'))
+ # print(main('mini06', file_path='mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒'))
+ # print(main('mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒', file_path='mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒'))
+ # print(main('XSJ138', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品'))
+ # print(main('DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌', file_path='DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌'))
+ # print(main('MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作', file_path='MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作'))
+ # print(main('MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列', file_path='MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列'))
+ # print(main('XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品'))
+ # print(main('SSN010'))
+ # print(main('國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露', file_path='國產AV 麻豆傳媒 MD0312 清純嫩穴賣身葬父 露露'))
+ # print(main('國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜', file_path='國產AV 大象傳媒 DA002 性感魅惑色兔兔 李娜娜'))
+ # print(main('韓國高端攝影頂 Yeha 私拍福利', file_path='韓國高端攝影頂 Yeha 私拍福利'))
+ print(main('EMTC-005', file_path='國產AV 愛神傳媒 EMTC005 怒操高冷社長秘書 米歐'))
+ # print(main('MDX-0016'))
+ # print(main('MDSJ-0004'))
+ # print(main('RS-020'))
+ # print(main('PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒', file_path='PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒'))
+ # print(main('老公在外出差家里的娇妻被入室小偷强迫性交 - 美酱'))
+ # print(main('', file_path='夏日回忆 贰 HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4'))
+ # print(main('', file_path='HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4'))
+ # print(main('', file_path="【HongKongDoll玩偶姐姐.短篇集.情人节特辑.Valentine's Day Special-cd2"))
+ # print(main('', file_path='PMC-062 唐茜.綠帽丈夫連同新弟怒操出軌老婆.強拍淫蕩老婆被操 唐茜.ts'))
+ # print(main('', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画'))
+ # print(main('淫欲游戏王.EP6', appoint_number='淫欲游戏王.EP5', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')) # EP不带.才能搜到
+ # print(main('', file_path='PMS-003.职场冰与火.EP3设局.宁静.苏文文.设局我要女人都臣服在我胯下.蜜桃影像传媒'))
+ # print(main('', file_path='PMS-001 性爱公寓EP04 仨人.蜜桃影像传媒.ts'))
+ # print(main('', file_path='PMS-001.性爱公寓EP03.ts'))
+ # print(main('', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.ts'))
+ # print(main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts'))
+ # main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')
+ # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱')) # 简体搜不到
+ # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木结爱.TS'))
+ # '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛', '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-', ' 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛']
+ # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱.TS'))
+ # print(main('', file_path='PMS-001 性爱公寓EP02 女王 蜜桃影像传媒 -莉娜乔安.TS'))
+ # print(main('91CM-081', file_path='91CM-081.田恬.李琼.继母与女儿.三.爸爸不在家先上妹妹再玩弄母亲.果冻传媒.mp4'))
+ # print(main('91CM-081', file_path='MDJ-0001.EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.麻豆传媒映画.mp4'))
+ # print(main('91CM-081', file_path='MDJ0001 EP2 AV 淫兽鬼父 陈美惠 .TS'))
+ # print(main('91CM-081', file_path='MXJ-0005.EP1.弥生美月.小恶魔高校生.与老师共度的放浪补课.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='PH-US-002.色控.音乐老师全裸诱惑.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.TS'))
+ # print(main('91CM-081', file_path='MD-0140-2.蜜苏.家有性事EP2.爱在身边.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDUS系列[中文字幕].LAX0025.性感尤物渴望激情猛操.RUCK ME LIKE A SEX DOLL.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='REAL野性派001-朋友的女友讓我最上火.TS'))
+ # print(main('91CM-081', file_path='MDS-009.张芸熙.巨乳旗袍诱惑.搔首弄姿色气满点.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDS005 被雇主强上的熟女家政妇 大声呻吟被操到高潮 杜冰若.mp4.TS'))
+ # print(main('91CM-081', file_path='TT-005.孟若羽.F罩杯性感巨乳DJ.麻豆出品x宫美娱乐.TS'))
+ # print(main('91CM-081', file_path='台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品.TS'))
+ # print(main('91CM-081', file_path='PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='鲍鱼游戏SquirtGame.吸舔碰糖.失败者屈辱凌辱.TS'))
+ # print(main('91CM-081', file_path='导演系列 外卖员的色情体验 麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDS007 骚逼女友在作妖-硬上男友当玩具 叶一涵.TS'))
+ # print(main('MDM-002')) # 去掉标题最后的发行商
+ # print(main('MDS-007')) # 数字要四位才能搜索到,即 MDS-0007 MDJ001 EP1 我的女优物语陈美惠.TS
+ # print(main('MDS-007', file_path='MDJ001 EP1 我的女优物语陈美惠.TS')) # 数字要四位才能搜索到,即 MDJ-0001.EP1
+ # print(main('91CM-090')) # 带横线才能搜到
+ # print(main('台湾SWAG chloebabe 剩蛋特辑 干爆小鹿')) # 带空格才能搜到
+ # print(main('淫欲游戏王EP2')) # 不带空格才能搜到
+ # print(main('台湾SWAG-chloebabe-剩蛋特輯-幹爆小鹿'))
+ # print(main('MD-0020'))
+ # print(main('mds009'))
+ # print(main('mds02209'))
+ # print(main('女王的SM调教'))
+ # print(main('91CM202'))
+ # print(main('91CM-202'))
diff --git a/src/models/crawlers/guochan.py b/src/models/crawlers/guochan.py
index c1c08bd..a8bec9b 100644
--- a/src/models/crawlers/guochan.py
+++ b/src/models/crawlers/guochan.py
@@ -6,6 +6,7 @@
import urllib3
import zhconv
+from models.base.number import remove_escape_string
urllib3.disable_warnings() # yapf: disable
@@ -15,219 +16,69 @@
def get_lable_list():
- return ['传媒', '国产短视频', '国产精品', '国产AV', 'PsychoPorn色控', '叮叮映画', '涩会', '蜜桃影像传媒',
- '大番号番啪啪', 'REAL野性派', '豚豚创媒', '宫美娱乐', '肉肉传媒', '爱妃传媒', '天美传媒', '皇家华人',
- '91制片厂', '果冻传媒', 'O-STAR', '兔子先生', '杏吧原创', '杏吧独家', '辣椒原创', '麻豆传媒', '糖心',
- '麻豆传媒映画', '红斯灯影像', '绝对领域', '麻麻传媒', '渡边传媒', 'AV帝王', '桃花源', '蝌蚪传媒', 'SWAG',
- '麻豆', '杏吧']
+ return ["麻豆传媒", "91茄子", "Ed Mosaic", "HongKongDoll", "JVID", "MINI传媒", "SA国际传媒", "TWAV", "乌鸦传媒",
+ "乐播传媒", "优蜜传媒", "偶蜜国际", "叮叮映画", "哔哩传媒", "大象传媒", "天美传媒", "开心鬼传媒", "微密圈",
+ "扣扣传媒", "抖阴传媒", "星空无限传媒", "映秀传媒", "杏吧传媒", "果冻传媒", "模密传媒", "爱污传媒", "爱神传媒",
+ "爱豆传媒", "狂点映像", "猛料原创", "猫爪影像", "皇家华人", "精东影业", "糖心VLOG", "维秘传媒", "草莓视频", "萝莉社",
+ "蜜桃传媒", "西瓜影视", "起点传媒", "香蕉视频", "PsychoPorn色控", "蜜桃影像传媒", "大番号番啪啪", "REAL野性派", "豚豚创媒",
+ "宫美娱乐", "肉肉传媒", "爱妃传媒", "91制片厂", "O-STAR","兔子先生", "杏吧原创", "杏吧独家", "辣椒原创", "麻豆传媒映画", "红斯灯影像",
+ "绝对领域", "麻麻传媒", "渡边传媒", "AV帝王", "桃花源", "蝌蚪传媒", "SWAG", "麻豆", "杏吧", "糖心", "国产短视频", "国产精品", "国产AV", "涩会"]
def get_actor_list():
- return [
- '苏妲己',
- '苏畅',
- '宁洋子',
- '沈芯语',
- '艾秋',
- '吴梦梦',
- '尹媚',
- '张芸熙',
- '夏晴子',
- '白佳萱',
- '林思妤',
- '沈娜娜',
- '仙儿媛',
- '许安妮',
- '刘语珊',
- '刘思慧',
- '叶一涵',
- '林亦涵',
- '雪千夏',
- '欧美玲',
- '赵佳美',
- '李慕儿',
- '徐韵珊',
- '苏娅',
- '糖果屋',
- '王茜',
- '李婧琪',
- '夏滢',
- '顾伊梦',
- '杜冰若',
- '赵颖儿',
- '秦可欣',
- '莫安安',
- '安娜',
- '黎星若',
- '仙儿',
- '林予曦',
- '蒋佑怡',
- '许书曼',
- '白晶晶',
- '王有容',
- '琳希',
- '李恩琦',
- '赵美凤',
- '王欣',
- '徐筱欣',
- '黄雅曼',
- '伊靖瑶',
- '菲于娜',
- '罗瑾萱',
- '金丞熙',
- '李文雯',
- '苏清歌',
- '付妙菱',
- '钟丽琪',
- '张娅庭',
- '蜜苏',
- '凌薇',
- '叶凡舒',
- '董小宛',
- '程雨沫',
- '瑶贝',
- '郭瑶瑶',
- '李嘉欣',
- '辰悦',
- '李曼妮',
- '洛雪',
- '千鹤',
- '袁庭妮',
- '林思好',
- '张云熙',
- '杜鹃',
- '玛丽莲',
- '李茹',
- '何苗',
- '黄雪纯',
- '田恬',
- '李琼',
- '聂小倩',
- '张晴',
- '丁香',
- '林凤娇',
- '刘颖儿',
- '杨思敏',
- '李忆彤',
- '伊蒂丝',
- '绿帽先生',
- '戚小怜',
- '杨柳',
- '唐茜',
- '苏艾文',
- '张曼青',
- '斑斑',
- '孟若羽',
- '陈圆圆',
- '雷梦娜',
- '氖氖',
- '仙儿',
- '艾玛',
- '蔚曼',
- '静静',
- '艾瑞卡',
- '娜米',
- '莉娜',
- '乔安',
- '林子涵',
- '萱萱',
- '糖糖',
- '徐婕',
- '王欣',
- '白颖',
- '吴芮瑜',
- '韩棠',
- '季妍希',
- '沙耶香',
- '七七',
- '莉娜乔安',
- '美雪樱',
- '柚木结爱',
- '黑田奈奈',
- '王亦舒',
- '张雅婷',
- '李文静',
- '肖泳',
- '韩小雅',
- '神山奈奈',
- '白川麻衣',
- '茜茜',
- '夜夜',
- '高梨遥香',
- 'HongKongDoll',
- '玩偶姐姐',
- '蘇妲己',
- '蘇暢',
- '寧洋子',
- '沈芯語',
- '吳夢夢',
- '張芸熙',
- '仙兒媛',
- '許安妮',
- '劉語珊',
- '劉思慧',
- '葉一涵',
- '歐美玲',
- '趙佳美',
- '李慕兒',
- '徐韻珊',
- '蘇婭',
- '夏瀅',
- '顧伊夢',
- '趙穎兒',
- '仙兒',
- '蔣佑怡',
- '許書曼',
- '趙美鳳',
- '黃雅曼',
- '伊靖瑤',
- '羅瑾萱',
- '蘇清歌',
- '鍾麗琪',
- '張婭庭',
- '蜜蘇',
- '葉凡舒',
- '瑤貝',
- '郭瑤瑤',
- '辰悅',
- '千鶴',
- '張雲熙',
- '杜鵑',
- '瑪麗蓮',
- '黃雪純',
- '李瓊',
- '聶小倩',
- '張晴',
- '林鳳嬌',
- '劉穎兒',
- '楊思敏',
- '李憶彤',
- '伊蒂絲',
- '綠帽先生',
- '戚小憐',
- '楊柳',
- '蘇艾文',
- '張曼青',
- '陳圓圓',
- '雷夢娜',
- '仙兒',
- '艾瑪',
- '靜靜',
- '喬安',
- '白穎',
- '吳芮瑜',
- '韓棠',
- '莉娜喬安',
- '美雪櫻',
- '柚木結愛',
- '張雅婷',
- '李文靜',
- '韓小雅',
- '高梨遙香',
- ]
+ return ['Madison Summers', 'Spencer Bradley', 'Madison Morgan', 'Rosalyn Sphinx', 'Braylin Bailey', 'Whitney Wright', 'Victoria Voxxx', 'Alexia Anders',
+ 'Bella Rolland', 'Violet Myers', 'Sophia Leone', 'Violet Starr', 'Eliza Ibarra', 'HongKongDoll', 'Keira Croft', 'April Olsen', 'Avery Black',
+ 'Amber Moore', 'Anny Aurora', 'Skylar Snow', 'Harley Haze', 'Paige Owens', 'Vanessa Sky', 'MasukuChan', 'Kate Bloom', 'Kimmy Kimm', 'Ana Foxxx',
+ 'Lexi Luna', 'Gia Derza', 'Skye Blue', 'Nico Love', 'Alyx Star', 'Ryan Reid', 'Kira Noir', 'Karma Rx', '下面有根棒棒糖', 'Vivian姐', 'COLA酱',
+ 'cola醬', 'Stacy', 'ROXIE', '真木今日子', '小七软同学', 'Chloe', 'Alona', '小日向可怜', 'NANA', '玩偶姐姐', '粉色情人', '桥本香菜', '冉冉学姐', '小二先生',
+ '饼干姐姐', 'Rona', '不见星空', '米娜学姐', '阿蛇姐姐', '樱花小猫', '樱井美里', '宸荨樱桃', '樱空桃桃', '牛奶泡芙', '91兔兔', '棉花糖糖', '桥本爱菜',
+ '许木学长', 'MOMO', '驯鹿女孩', '高梨遥香', 'DORY', '冬月结衣', 'Aida', '香菜公主', '藤田美绪', '浅尾美羽', '天音美纱', '中条爱莉', '三月樱花', 'Emma',
+ 'Vita', '千夜喵喵', '水原圣子', '白川麻衣', '池田奈美', '西村莉娜', 'A天使爱', '中野惠子', '麻衣CC', '樱桃空空', 'LENA', '小泽纱香', '木下日葵', '中岛芳子',
+ '弥生美月', '逢见梨花', '宇佐爱花', '沙月芽衣', '羽月萌音', '前田由美', '伊东爱瑠', 'Misa', '绿帽先生', '莉娜乔安', '柚木结爱', '黑田奈奈', '神山奈奈',
+ '孟若羽', '夏晴子', '吴梦梦', '沈娜娜', '李蓉蓉', '林思妤', '仙儿媛', '金宝娜', '季妍希', '温芮欣', '吴文淇', '苏语棠', '秦可欣', '吴芳宜', '李娜娜',
+ '乐奈子', '宋南伊', '小水水', '白允儿', '管明美', '雪千夏', '苏清歌', '玥可岚', '梁芸菲', '白熙雨', '小敏儿', '楚梦舒', '柚子猫', '姚宛儿', '宋雨川',
+ '舒可芯', '苏念瑾', '白沛瑶', '林沁儿', '唐雨菲', '李允熙', '张芸熙', '寻小小', '白靖寒', '钟宛冰', '李薇薇', '米菲兔', '雷梦娜', '董悦悦', '袁子仪',
+ '赖畇希', '王以欣', '夏禹熙', '狐不妖', '凌波丽', '黎芷萱', '陆斑比', '辛尤里', '小猫咪', '顾桃桃', '南芊允', '岚湘庭', '林芊彤', '梁佳芯', '林凤娇',
+ '明日香', '绫波丽', '邓紫晴', '赵一曼', '吴茜茜', '锅锅酱', '倪哇哇', '潘雨曦', '吴恺彤', '美杜莎', '郭童童', '陈可心', '莫夕慈', '沈芯语', '董小宛',
+ '苏艾文', '翁雨澄', '赵晓涵', '小桃酱', '宋东琳', '香月怜', '李文雯', '白若冰', '徐夜夜', '真希波', '爱丽丝', '张宇芯', '金善雅', '李依依', '苏安亚',
+ '奶咪酱', '白葵司', '罗瑾萱', '宁洋子', '小夜夜', '白晶晶', '张雅婷', '吴心语', '林曼芸', '项子甯', '吳芳宜', '苏小小', '文冰冰', '韩宝儿', '白星雨',
+ '林怡梦', '张欣妍', '七濑恋', '白思吟', '吴凯彤', '溫芮欣', '林可菲', '黎芷媗', '御梦子', '苏雨彤', '古伊娜', '聂小倩', '陈圆圆', '沙美辰', '林妙可',
+ '乐淆雪', '李恩娜', '周晴晴', '杨思敏', '李曼妮', '戚小怜', '谢语彤', '王筱璐', '卢珊珊', '程诗诗', '林玥玥', '白瞳瞳', '魏乔安', '米胡桃', '施子涵',
+ '北野爱', '杜冰若', '玛丽莲', '胡蓉蓉', '万静雪', '花语柔', '萧悦儿', '林晓雪', '兰心洁', '神谷怜', '唐雨霏', '鸡蛋饼', '沈湘妮', '费爵娜', '小美惠',
+ '大奶露', '向若云', '苏小沫', '榨汁姬', '陈星然', '夏雨荷', '姚彤彤', '莫云雪', '艾瑞卡', '黄雪纯', '赵雅琳', '叶宸欣', '伊琬琳', '陈美惠', '金巧巧',
+ '陈美琳', '陆思涵', '顾小北', '陈小雨', '维里娜', '兔小白', '叶子红', '美凉子', '李丹彤', '李微微', '白婷婷', '艾米酱', '刘小姗', '白童童', '张琪琪',
+ 'Yua', '小玩子', '岚可彤', '都可可', '李慕儿', '叶一涵', '赵佳美', '董小婉', '钟丽琪', '韩小雅', '杨朵儿', '叶梦语', '程雨沫', '张曼青', '纪妍希', '伊婉琳',
+ '凌雨萱', '潘甜甜', '美竹玲', '韩依人', '奈奈子', '林雪漫', '宋甜甜', '陆雪琪', '宋妮可', '陆子欣', '范可可', '许依然', '苏小新', '蒋梦琳', '李可欣',
+ '小鹿酱', '小林杏', '陶杏儿', '明步奈', '苏宁儿', '白潼潼', '增田枫', '特污兔', '何安汝', '倪菀儿', '唐可可', '口罩酱', '小千绪', '糖糖儿', '许安妮',
+ '李婧琪', '刘思慧', '欧阳晶', '欧美玲', '林亦涵', '钟以彤', '许书曼', '付妙菱', '伊靖瑶', '张娅庭', '韩小野', '宫泽蓝', '冯思雨', '林小樱', '刘颖儿',
+ '莫潇潇', '胡心瑶', '林雨露', '苏婧薇', '许月珍', '陈若瑶', '吴芮瑜', '叶如梦', '刘依依', '吴语菲', '张妮妮', '林子涵', '张子瑜', '周卿卿', '李师师',
+ '苏语堂', '方紫璐', '袁采菱', '刘清韵', '李曼丽', '刘小雯', '姬咲华', '高小颜', '蔡晓雨', '梁如意', '林语桐', '王小妮', '唐月琴', '星谷瞳', '何小丽',
+ '张婉妍', '酒井爱', '张秀玲', '晚晚酱', '薛梦琪', '李乐乐', '张佳晨', '程媛媛', '沐语柔', '安琪拉', '韩倪希', '苏妲己', '白佳萱', '刘语珊', '徐韵珊',
+ '糖果屋', '顾伊梦', '赵颖儿', '莫安安', '黎星若', '林予曦', '蒋佑怡', '王有容', '李恩琦', '赵美凤', '徐筱欣', '黄雅曼', '菲于娜', '金丞熙', '叶凡舒',
+ '郭瑶瑶', '李嘉欣', '袁庭妮', '林思好', '张云熙', '李忆彤', '伊蒂丝', '沙耶香', '美雪樱', '王亦舒', '李文静', '鸡教练', '斑斑', '坏坏', '糖糖', '艾秋',
+ '凌薇', '莉娜', '韩棠', '苡若', '尤莉', '优娜', '林嫣', '徐蕾', '周甯', '唐茜', '香菱', '佳芯', '湘湘', '米欧', '斑比', '蜜苏', '小婕', '艾熙', '娃娃',
+ '艾玛', '雪霏', '夜夜', '欣欣', '乔安', '羽芮', '美酱', '师师', '玖玖', '橙子', '晨曦', '苏娅', '黎儿', '晨晨', '嘉洛', '小遥', '苏畅', '琪琪', '苡琍',
+ '李慕', '心萱', '昀希', '黎娜', '乐乐', '樱桃', '桐桐', '苏璇', '安娜', '悠悠', '茉莉', '宛冰', '静静', '丝丝', '菲菲', '樱樱', '波妮', '唐芯', '小野',
+ '何苗', '甜心', '瑶瑶', '小捷', '薇薇', '美樱', '宁静', '欧妮', '吉吉', '小桃', '绯丽', '嘉琪', '咪妮', '雯茜', '小洁', '李琼', '唐霏', '岚玥', '熙熙',
+ '米娅', '舒舒', '斯斯', '欣怡', '妍儿', '阿雅', '宋可', '畇希', '柔伊', '雅沁', '惠敏', '露露', '艾悠', '娜娜', '李娜', '肖云', '王玥', '林洋', '清洛',
+ '艾鲤', '依涵', '半雪', '琦琦', '莎莎', '小冉', '琳怡', '莉奈', '梅子', '啤儿', '瑶贝', '杨柳', '童汐', '米亚', '琳达', '晴天', 'KK', '紫宸', '淑怡',
+ '花花', '金铭', '程葳', '妍希', '咪妃', '茜茜', '小蜜', '凌萱', '觅嫣', '涵涵', '欲梦', '美琳', '杜鹃', '许诺', '兮兮', '白鹿', '虞姬', '丽萨', '蔷薇',
+ '小影', '优优', '茶茶', '可儿', '甜甜', '憨憨', '波尼', '依颂', '依依', '思思', '芳情', '月牙', '小爱', '淳儿', '苗方', '茶理', '苹果', '苏然', '陶子',
+ '董欣', '羽熙', '清沐', '林襄', '娃诺', '洁咪', '小昭', '球球', '紫萱', '南兰', '安琪', '可乐', '夏露', '诗琪', '陈韵', '丽娜', '苏旋', '月月', '石榴',
+ '米兰', '恩恩', '西子', '芷萱', '酥酥', '王茜', '千鹤', '雪见', '姜洁', '张晴', '辰悦', '丁香', '白颖', '穆娜', '小芳', '吉娜', '秋霞', '无双', '夏宝',
+ '舒涵', '小柔', '小小', '璇元', '米砂', '余丽', '美嘉', '莉莉', '奈奈', '黑糖', '晴子', '多乙', '徐婕', '闵闵', '小雪', '洋洋', '明儿', '苏茜', '芯怡',
+ '姚茜', '百合', '婉婷', '小乔', '芽芽', '婕珍', '乔乔', '紫寒', '小薇', '菜菜', '洁米', '夏天', '灵枝', '语伊', '徐艳', '王佩', '希汶', '雅捷', '喵喵',
+ '尤奈', '仙儿', '氖氖', '蔚曼', '田恬', '颂潮', '小婵', '千凌', '李燕', '林芳', '杨桃', '艾莉', '落落', '冯雪', '王蓉', '妖妖', '雨晨', '心雪', '穆雪',
+ '韩焉', '邱月', '檀雅', '柯柯', '七七', '鱼儿', '丹丹', '简一', '淑仪', '小哇', '朵儿', '妲己', '云朵', '唐菲', '邦妮', '白英', '夏夏', '安安', '小艺',
+ '丽丽', '敏敏', '空空', '椿芽', '小言', '李蕊', '水水', '小鱼', '艾艾', '尹媚', '夏滢', '琳希', '王欣', '洛雪', '李茹', '娜米', '萱萱', '肖泳']
def get_number_list(number, appoint_number='', file_path=''): # 处理国产番号
+
+ # 国产匹配番号或标题前也可以先排除路径中多余字符
+ if file_path:
+ file_path = remove_escape_string(file_path)
+
file_name = os.path.splitext(os.path.split(file_path)[1])[0].upper() if file_path else ''
number = number.upper()
number_list = [] # 返回一个番号列表,用来搜索
@@ -357,7 +208,11 @@ def get_number_list(number, appoint_number='', file_path=''): # 处理国产番
number_list.extend([number_normal_4, number_has_nothing_4, number_has_space_4])
if len(number_list):
break
+ # 番号识别将纯数字和字母放在最前面(将长度最短的放前面即可),刮削网站一般也只取 number_list 第一项进行搜索,其他用于搜索结果页比对
+ sorted_number_list = sorted(number_list, key=lambda x: len(x))
+
+ # 以下处理没有番号的作品
# 台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品
# PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画
# 國産麻豆AV 麻豆番外 大番號女優空降上海 特別篇 沈芯語
@@ -403,16 +258,63 @@ def get_number_list(number, appoint_number='', file_path=''): # 处理国产番
# 把文件名加到列表
filename_list.append(real_file_name)
+ # 演员后面的第一句成功刮削概率较高,插入列表第一项
+ # 超级丝袜控180大长腿女神▌苹果▌我的室友 第八篇 黑丝女仆骚丁小穴湿淋淋 肉棒塞满激怼爆射
+ # 17205-最新极品天花板小萝莉▌粉色情人▌摄影师的威胁 粗屌爆艹少女白虎嫩鲍 极速刮擦蜜壶淫靡下体
+ # 潮喷淫娃御姐〖小水水〗和异地大奶女友开房,激情互舔口爆高潮喷水,黑丝美腿女神极度淫骚 潮喷不停
+ # 极品爆乳鲜嫩美穴貌美尤物▌苏美奈▌家政女仆的肉体服务 肏到羞耻喷汁 极射中出鲜嫩美穴
+ # 【小酒改头换面】,罕见大胸嫩妹,小伙今夜捡到宝了
+ if u := re.search(r'(【.+】|▌.+▌|〖.+〗|『.+』)[,,\- ]?(\S{6,18}?)[,,\- ]', real_file_name):
+ search_char = u.group(2)
+ filename_list.insert(0, search_char)
+
# 转繁体
filename_list.append(zhconv.convert(filename_list[0], 'zh-hant'))
# 去重去空
new_number_list = []
new_filename_list = []
- [new_number_list.append(i) for i in number_list if i and i not in new_number_list]
+ [new_number_list.append(i) for i in sorted_number_list if i and i not in new_number_list]
[new_filename_list.append(i) for i in filename_list if i and i not in new_filename_list]
return new_number_list, new_filename_list
+def get_extra_info(title, file_path, info_type, tag='', actor='', series=''):
+
+ all_info = title + file_path + tag + actor + series
+
+ # 未找到标签时,从各种信息里匹配,忽略大小写
+ if info_type == "tag":
+ tag_list = []
+ all_tag = get_lable_list()
+ for each in all_tag:
+ if re.search(f'{each}', all_info, re.IGNORECASE):
+ tag_list.append(each)
+ new_tag_list = []
+ [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]
+ return ','.join(new_tag_list)
+
+ # 未找到演员时,看热门演员是否在标题和各种信息里,人名完全匹配
+ if info_type == "actor":
+ actor_list = []
+ all_actor = get_actor_list()
+ for each in all_actor:
+ if re.search(fr'\b{each}\b', all_info, re.IGNORECASE):
+ actor_list.append(each)
+ new_actor_list = []
+ [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
+ return ','.join(new_actor_list)
+
+ # 未找到系列时,从各种信息里匹配,没有相关数据,预留逻辑
+ if info_type == "series":
+ series_list = []
+ all_series = get_lable_list()
+ for each in all_series:
+ if each in all_info.upper():
+ series_list.append(each)
+ new_series_list = []
+ [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
+ return ','.join(new_series_list)
+
if __name__ == '__main__':
# yapf: disable
diff --git a/src/models/crawlers/hscangku.py b/src/models/crawlers/hscangku.py
new file mode 100644
index 0000000..9b84373
--- /dev/null
+++ b/src/models/crawlers/hscangku.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import json
+import re
+import time
+
+import urllib3
+import zhconv
+from lxml import etree
+
+from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_number_list
+from models.crawlers.guochan import get_actor_list, get_lable_list,get_extra_info
+
+urllib3.disable_warnings() # yapf: disable
+
+# import traceback
+
+def get_actor_photo(actor):
+ actor = actor.split(',')
+ data = {}
+ for i in actor:
+ actor_photo = {i: ''}
+ data.update(actor_photo)
+ return data
+
+
+def get_detail_info(html, real_url, number, file_path):
+ href = re.split(r'[/.]', real_url)[-2]
+ title_h1 = html.xpath('//h3[@class="title" and not(contains(normalize-space(.), "目录")) and not(contains(normalize-space(.), "为你推荐"))]/text()')
+ title = title_h1[0].replace(number + ' ', '').strip() if title_h1 else number
+ actor = get_extra_info(title, file_path, info_type="actor")
+ tag = get_extra_info(title, file_path, info_type="tag")
+ cover_url = html.xpath(f'//a[@data-original and contains(@href,"{href}")]/@data-original')
+ cover_url = cover_url[0] if cover_url else ''
+
+ return number, title, actor, cover_url, tag
+
+
+def get_real_url(html, number_list, hscangku_url):
+ item_list = html.xpath('//a[@class="stui-vodlist__thumb lazyload"]')
+ for each in item_list:
+        # href="/vodplay/41998-1-1.html"
+ detail_url = hscangku_url + each.get('href')
+ title = each.xpath('@title')[0]
+ if title and detail_url:
+ for n in number_list:
+ temp_n = re.sub(r'[\W_]', '', n).upper()
+ temp_title = re.sub(r'[\W_]', '', title).upper()
+ if temp_n in temp_title:
+ return True, n, title, detail_url
+ return False, '', '', ''
+
+def get_redirected_url(url):
+
+ result, response = curl_html(url)
+ if not result:
+ return None
+
+ if redirected_url := re.search(r'"(https?://.*?)"', response).group(1):
+ http = urllib3.PoolManager()
+ response = http.request('GET', f'{redirected_url}{url}&p=', redirect=False)
+ final_url = response.get_redirect_location()
+ return final_url if final_url else None
+ else:
+ return None
+
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path='', appoint_number=''):
+ start_time = time.time()
+ website_name = 'hscangku'
+ req_web += '-> %s' % website_name
+ title = ''
+ cover_url = ''
+ web_info = '\n '
+ log_info += ' \n 🌐 hscangku'
+ debug_info = ''
+ real_url = appoint_url
+ hscangku_url = getattr(config, 'hscangku_website', 'http://hsck.net')
+
+ try:
+ if not real_url:
+ # 处理番号
+ number_list, filename_list = get_number_list(number, appoint_number, file_path)
+ n_list = number_list[:1] + filename_list
+            # 处理重定向
+ hscangku_url = get_redirected_url(hscangku_url)
+ if not hscangku_url:
+ debug_info = '没有正确的 hscangku_url,无法刮削'
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+ for each in n_list:
+ real_url = f'{hscangku_url}/vodsearch/-------------.html?wd={each}&submit='
+ # real_url = 'http://hsck860.cc/vodsearch/-------------.html?wd=%E6%9F%9A%E5%AD%90%E7%8C%AB&submit='
+ debug_info = f'请求地址: {real_url} '
+ log_info += web_info + debug_info
+ result, response = curl_html(real_url)
+
+ if not result:
+ debug_info = '网络请求错误: %s' % response
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+ search_page = etree.fromstring(response, etree.HTMLParser())
+ result, number, title, real_url = get_real_url(search_page, n_list, hscangku_url)
+ # real_url = 'http://hsck860.cc/vodsearch/-------------.html?wd=%E6%9F%9A%E5%AD%90%E7%8C%AB&submit='
+ if result:
+ break
+ else:
+ debug_info = '没有匹配的搜索结果'
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+
+ debug_info = f'番号地址: {real_url} '
+ log_info += web_info + debug_info
+ result, response = curl_html(real_url)
+
+ if not result:
+ debug_info = '没有找到数据 %s ' % response
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+
+ detail_page = etree.fromstring(response, etree.HTMLParser())
+ number, title, actor, cover_url, tag = get_detail_info(detail_page, real_url, number, file_path)
+ actor_photo = get_actor_photo(actor)
+
+ try:
+ dic = {
+ 'number': number,
+ 'title': title,
+ 'originaltitle': title,
+ 'actor': actor,
+ 'outline': '',
+ 'originalplot': '',
+ 'tag': tag,
+ 'release': '',
+ 'year': '',
+ 'runtime': '',
+ 'score': '',
+ 'series': '',
+ 'country': 'CN',
+ 'director': '',
+ 'studio': '',
+ 'publisher': '',
+ 'source': 'hscangku',
+ 'website': real_url,
+ 'actor_photo': actor_photo,
+ 'cover': cover_url,
+ 'poster': '',
+ 'extrafanart': '',
+ 'trailer': '',
+ 'image_download': False,
+ 'image_cut': 'no',
+ 'log_info': log_info,
+ 'error_info': '',
+ 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )),
+ 'mosaic': '国产',
+ 'wanted': '',
+ }
+ debug_info = '数据获取成功!'
+ log_info += web_info + debug_info
+ dic['log_info'] = log_info
+ except Exception as e:
+ debug_info = '数据生成出错: %s' % str(e)
+ log_info += web_info + debug_info
+ raise Exception(debug_info)
+
+ except Exception as e:
+ # print(traceback.format_exc())
+ debug_info = str(e)
+ dic = {
+ 'title': '',
+ 'cover': '',
+ 'website': '',
+ 'log_info': log_info,
+ 'error_info': debug_info,
+ 'req_web': req_web + '(%ss) ' % (round((time.time() - start_time), )),
+ }
+ dic = {website_name: {'zh_cn': dic, 'zh_tw': dic, 'jp': dic}}
+ js = json.dumps(
+ dic,
+ ensure_ascii=False,
+ sort_keys=False,
+ indent=4,
+ separators=(',', ': '),
+ )
+ return js
+
+
+if __name__ == '__main__':
+ # yapf: disable
+ # print(main('GDCM-018'))
+ # print(main('国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度', file_path='国产一姐裸替演员沈樵Qualla作品.七旬老农的女鬼诱惑.国语原创爱片新高度'))
+ # print(main('RS001', file_path='RS-001.红斯灯影像.REDSTEN.淫白大胜利.上.男女水中竞赛.败方被强制插入高潮连连'))
+ # print(main('MD-0269', file_path='MD-0269.梁佳芯.唐芯.换妻性爱淫元宵.正月十五操骚鲍.麻豆传媒映画原创中文原版收藏'))
+ # print(main('sh-006', file_path='SH-006.谢冰岚.神屌侠侣.是谁操了我的小龙女.涩会传媒'))
+ # print(main('PMC-085', file_path='PMC/PMC-085.雪霏.出差借宿小姨子乱伦姐夫.特别照顾的肉体答谢.蜜桃影像传媒.ts'))
+ # print(main('TM-0165', file_path='TM0165.王小妮.妈妈的性奴之路.性感少妇被儿子和同学调教成性奴.天美传媒'))
+ # print(main('mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒'))
+ # print(main('mini06', file_path='mini06.全裸家政.只為弟弟的學費打工.被玩弄的淫亂家政小妹.mini傳媒'))
+ # print(main('mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒', file_path='mini06.全裸家政.只为弟弟的学费打工.被玩弄的淫乱家政小妹.mini传媒'))
+ # print(main('XSJ138', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品'))
+ # print(main('DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌', file_path='DW-006.AV帝王作品.Roxie出演.地方妈妈的性解放.双穴双屌'))
+ # print(main('MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作', file_path='MDJ001-EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.2021麻豆最强跨国合作'))
+ # print(main('MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列', file_path='MKY-TN-003.周宁.乱伦黑料流出.最喜欢爸爸的鸡巴了.麻豆传媒MKY系列'))
+ # print(main('XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品', file_path='XSJ138.养子的秘密教学EP6.薇安姐内射教学.性视界出品'))
+ # print(main('大像传媒之淫蕩刺青女學徒', file_path='大像传媒之淫蕩刺青女學徒'))
+ # print(main('冠希传媒GX-017强上弟弟的巨乳姐姐', file_path='冠希传媒GX-017强上弟弟的巨乳姐姐'))
+ # print(main('[SWAG]XHX-0014宅男的公仔幻化成人', file_path='[SWAG]XHX-0014宅男的公仔幻化成人'))
+ # print(main('IDG5401'))
+ print(main('大像传媒之長腿癡女代表情慾作-米歐', file_path='大像传媒之長腿癡女代表情慾作-米歐'))
+ # print(main('MDX-0016'))
+ # print(main('MDSJ-0004'))
+ # print(main('RS-020'))
+ # print(main('PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒', file_path='PME-018.雪霏.禽兽小叔迷奸大嫂.性感身材任我玩弄.蜜桃影像传媒'))
+ # print(main('老公在外出差家里的娇妻被入室小偷强迫性交 - 美酱'))
+ # print(main('', file_path='夏日回忆 贰 HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4'))
+ # print(main('', file_path='HongKongDoll玩偶姐姐.短篇集.夏日回忆 贰.Summer Memories.Part 2.mp4'))
+ # print(main('', file_path="【HongKongDoll玩偶姐姐.短篇集.情人节特辑.Valentine's Day Special-cd2"))
+ # print(main('', file_path='PMC-062 唐茜.綠帽丈夫連同新弟怒操出軌老婆.強拍淫蕩老婆被操 唐茜.ts'))
+ # print(main('', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画'))
+ # print(main('淫欲游戏王.EP6', appoint_number='淫欲游戏王.EP5', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')) # EP不带.才能搜到
+ # print(main('', file_path='PMS-003.职场冰与火.EP3设局.宁静.苏文文.设局我要女人都臣服在我胯下.蜜桃影像传媒'))
+ # print(main('', file_path='PMS-001 性爱公寓EP04 仨人.蜜桃影像传媒.ts'))
+ # print(main('', file_path='PMS-001.性爱公寓EP03.ts'))
+ # print(main('', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.ts'))
+ # print(main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts'))
+ # main('', file_path='淫欲游戏王.EP6.情欲射龙门.性爱篇.郭童童.李娜.双英战龙根3P混战.麻豆传媒映画.ts')
+ # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱')) # 简体搜不到
+ # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木结爱.TS'))
+ # '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛', '麻豆傳媒映畫原版 兔子先生 拉麵店搭訕超可愛少女下-', ' 兔子先生 拉麵店搭訕超可愛少女下-柚木結愛']
+ # print(main('', file_path='麻豆傳媒映畫原版 兔子先生 我的女友是女優 女友是AV女優是怎樣的體驗-美雪樱.TS'))
+ # print(main('', file_path='PMS-001 性爱公寓EP02 女王 蜜桃影像传媒 -莉娜乔安.TS'))
+ # print(main('91CM-081', file_path='91CM-081.田恬.李琼.继母与女儿.三.爸爸不在家先上妹妹再玩弄母亲.果冻传媒.mp4'))
+ # print(main('91CM-081', file_path='MDJ-0001.EP3.陈美惠.淫兽寄宿家庭.我和日本父子淫乱的一天.麻豆传媒映画.mp4'))
+ # print(main('91CM-081', file_path='MDJ0001 EP2 AV 淫兽鬼父 陈美惠 .TS'))
+ # print(main('91CM-081', file_path='MXJ-0005.EP1.弥生美月.小恶魔高校生.与老师共度的放浪补课.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MKY-HS-004.周寗.催情民宿.偷下春药3P干爆夫妇.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='PH-US-002.色控.音乐老师全裸诱惑.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDX-0236-02.沈娜娜.青梅竹马淫乱3P.麻豆传媒映画x逼哩逼哩blibli.TS'))
+ # print(main('91CM-081', file_path='MD-0140-2.蜜苏.家有性事EP2.爱在身边.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDUS系列[中文字幕].LAX0025.性感尤物渴望激情猛操.RUCK ME LIKE A SEX DOLL.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='REAL野性派001-朋友的女友讓我最上火.TS'))
+ # print(main('91CM-081', file_path='MDS-009.张芸熙.巨乳旗袍诱惑.搔首弄姿色气满点.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDS005 被雇主强上的熟女家政妇 大声呻吟被操到高潮 杜冰若.mp4.TS'))
+ # print(main('91CM-081', file_path='TT-005.孟若羽.F罩杯性感巨乳DJ.麻豆出品x宫美娱乐.TS'))
+ # print(main('91CM-081', file_path='台湾第一女优吴梦梦.OL误上痴汉地铁.惨遭多人轮番奸玩.麻豆传媒映画代理出品.TS'))
+ # print(main('91CM-081', file_path='PsychoPorn色控.找来大奶姐姐帮我乳交.麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='鲍鱼游戏SquirtGame.吸舔碰糖.失败者屈辱凌辱.TS'))
+ # print(main('91CM-081', file_path='导演系列 外卖员的色情体验 麻豆传媒映画.TS'))
+ # print(main('91CM-081', file_path='MDS007 骚逼女友在作妖-硬上男友当玩具 叶一涵.TS'))
+ # print(main('MDM-002')) # 去掉标题最后的发行商
+ # print(main('MDS-007')) # 数字要四位才能搜索到,即 MDS-0007 MDJ001 EP1 我的女优物语陈美惠.TS
+ # print(main('MDS-007', file_path='MDJ001 EP1 我的女优物语陈美惠.TS')) # 数字要四位才能搜索到,即 MDJ-0001.EP1
+ # print(main('91CM-090')) # 带横线才能搜到
+ # print(main('台湾SWAG chloebabe 剩蛋特辑 干爆小鹿')) # 带空格才能搜到
+ # print(main('淫欲游戏王EP2')) # 不带空格才能搜到
+ # print(main('台湾SWAG-chloebabe-剩蛋特輯-幹爆小鹿'))
+ # print(main('MD-0020'))
+ # print(main('mds009'))
+ # print(main('mds02209'))
+ # print(main('女王的SM调教'))
+ # print(main('91CM202'))
+ # print(main('91CM-202'))
diff --git a/src/models/crawlers/madouqu.py b/src/models/crawlers/madouqu.py
index 33c2b92..57e9ad2 100644
--- a/src/models/crawlers/madouqu.py
+++ b/src/models/crawlers/madouqu.py
@@ -9,13 +9,14 @@
from models.base.web import curl_html
from models.crawlers.guochan import get_number_list
+from models.config.config import config
+from models.crawlers.guochan import get_actor_list, get_lable_list, get_extra_info
urllib3.disable_warnings() # yapf: disable
# import traceback
-
def get_actor_photo(actor):
actor = actor.split(',')
data = {}
@@ -25,7 +26,7 @@ def get_actor_photo(actor):
return data
-def get_detail_info(html, number):
+def get_detail_info(html, number, file_path):
detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//p//text()')
# detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//text()')
title_h1 = html.xpath('//div[@class="cao_entry_header"]/header/h1/text()')
@@ -48,6 +49,7 @@ def get_detail_info(html, number):
cover_url = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p/img/@src')
cover_url = cover_url[0] if cover_url else ''
# print(number, title, actor, cover_url, studio, detail_info)
+ actor = get_extra_info(title, file_path, info_type="actor") if actor == '' else actor
return number, title, actor, cover_url, studio
@@ -55,7 +57,8 @@ def get_real_url(html, number_list):
item_list = html.xpath('//div[@class="entry-media"]/div/a')
for each in item_list:
detail_url = each.get('href')
- title = each.xpath('img[@class="lazyload"]/@alt')[0]
+ # lazyload属性容易改变,去掉也能拿到结果
+ title = each.xpath('img[@class]/@alt')[0]
if title and detail_url:
for n in number_list:
temp_n = re.sub(r'[\W_]', '', n).upper()
@@ -75,6 +78,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file
log_info += ' \n 🌐 madouqu'
debug_info = ''
real_url = appoint_url
+ madouqu_url = getattr(config, 'madouqu_website', False)
try:
if not real_url:
@@ -82,7 +86,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file
number_list, filename_list = get_number_list(number, appoint_number, file_path)
n_list = number_list[:1] + filename_list
for each in n_list:
- real_url = f'https://madouqu.com/?s={each}'
+ real_url = f'{madouqu_url}/?s={each}' if madouqu_url else f'https://madouqu.com/?s={each}'
# real_url = 'https://madouqu.com/?s=XSJ-138.%E5%85%BB%E5%AD%90%E7%9A%84%E7%A7%98%E5%AF%86%E6%95%99%E5%AD%A6EP6'
debug_info = f'请求地址: {real_url} '
log_info += web_info + debug_info
@@ -111,7 +115,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file
raise Exception(debug_info)
detail_page = etree.fromstring(response, etree.HTMLParser())
- number, title, actor, cover_url, studio = get_detail_info(detail_page, number)
+ number, title, actor, cover_url, studio = get_detail_info(detail_page, number, file_path)
actor_photo = get_actor_photo(actor)
try:
diff --git a/src/models/crawlers/mdtv.py b/src/models/crawlers/mdtv.py
index 94efea8..511b861 100644
--- a/src/models/crawlers/mdtv.py
+++ b/src/models/crawlers/mdtv.py
@@ -40,6 +40,8 @@ def get_some_info(html, title, file_path):
# 未找到演员时,看热门演员是否在标题和各种信息里
series = series_list[0] if series_list else ''
tag = ','.join(tag_list)
+ actor_fake_name = any('未知' in item for item in actor_list)
+ actor_list = [] if actor_fake_name else actor_list
if not actor_list:
all_info = title + series + tag + file_path
all_actor = get_actor_list()
diff --git a/src/models/crawlers/mmtv.py b/src/models/crawlers/mmtv.py
index 0e7e143..bd51592 100644
--- a/src/models/crawlers/mmtv.py
+++ b/src/models/crawlers/mmtv.py
@@ -9,6 +9,8 @@
from models.base.number import is_uncensored
from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_actor_list, get_lable_list
urllib3.disable_warnings() # yapf: disable
@@ -30,15 +32,53 @@ def get_title(html, web_number):
return result[0].replace(web_number, '').strip() if result else ''
-def get_actor(html):
+def get_actor(html, title, file_path):
actor_list = html.xpath('//div[@class="fullvideo-idol"]/span/a/text()')
actor = ''
if actor_list:
for each in actor_list:
'''愛澄玲花,日高ゆりあ(青山ひより) 菜津子 32歳 デザイナー'''
actor += re.sub(r'(.+)', '', each).split(' ')[0] + ','
+ else:
+ actor = get_some_info(title, file_path, info_type="actor")
return actor.strip(',')
+def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
+
+ all_info = title + file_path + tag + actor + series
+
+ # 未找到标签时,从各种信息里匹配
+ if info_type == "tag":
+ tag_list = []
+ all_tag = get_lable_list()
+ for each in all_tag:
+ if each in all_info:
+ tag_list.append(each)
+ new_tag_list = []
+ [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]
+ return ','.join(new_tag_list)
+
+ # 未找到演员时,看热门演员是否在标题和各种信息里
+ if info_type == "actor":
+ actor_list = []
+ all_actor = get_actor_list()
+ for each in all_actor:
+ if each in all_info:
+ actor_list.append(each)
+ new_actor_list = []
+ [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
+ return ','.join(new_actor_list)
+
+ # 未找到系列时,从各种信息里匹配
+ if info_type == "series":
+ series_list = []
+ all_series = get_lable_list()
+ for each in all_series:
+ if each in all_info:
+ series_list.append(each)
+ new_series_list = []
+ [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
+ return ','.join(new_series_list)
def get_real_url(html, number):
result = html.xpath('//figure[@class="video-preview"]/a')
@@ -134,7 +174,12 @@ def get_tag(html):
def get_extrafanart(html):
- result = html.xpath('//a[@class="screens-item fresco"]/@href')
+ # 前几张
+ result1 = html.xpath('//span/img[contains(@class, "lazyload")]/@data-src')
+ # 其他隐藏需点击的
+ if result2 := html.xpath('//div[contains(@class, "fullvideo")]/script[@language="javascript"]/text()'):
+ result2 = re.findall(r'https?://.+?\.jpe?g', str(result2))
+ result = result1 + result2
return result if result else ''
@@ -166,7 +211,7 @@ def get_number(html, number):
return number.replace('FC2-PPV ', 'FC2-'), release, runtime, number
-def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path=''):
start_time = time.time()
website_name = '7mmtv'
req_web += '-> %s' % website_name
@@ -175,9 +220,13 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
web_info = '\n '
log_info += ' \n 🌐 7mmtv'
debug_info = ''
+ mmtv_url = 'https://www.7mmtv.sx'
+ if hasattr(config, '7mmtv_website'):
+ mmtv_url = getattr(config, '7mmtv_website')
real_url = appoint_url
# search_url = "https://bb9711.com/zh/searchform_search/all/index.html"
- search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+ # search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+ search_url = f"{mmtv_url}/zh/searchform_search/all/index.html"
mosaic = ''
try:
@@ -186,7 +235,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
if number.upper().startswith('FC2'):
search_keyword = re.findall(r'\d{3,}', number)[0]
- search_url = f'https://7mmtv.sx/zh/searchform_search/all/index.html?search_keyword={search_keyword}&search_type=searchall&op=search'
+ search_url = f'{search_url}?search_keyword={search_keyword}&search_type=searchall&op=search'
debug_info = f'搜索地址: {search_url} '
log_info += web_info + debug_info
result, response = curl_html(search_url)
@@ -220,7 +269,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
debug_info = '数据获取失败: 未获取到title!'
log_info += web_info + debug_info
raise Exception(debug_info)
- actor = get_actor(html_info)
+ actor = get_actor(html_info, title, file_path)
actor_photo = get_actor_photo(actor)
cover_url = get_cover(html_content)
outline, originalplot = get_outline(html_info)
@@ -245,7 +294,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
'runtime': runtime,
'score': '',
'series': '',
- 'country': 'JP',
+ 'country': 'CN',
'director': director,
'studio': studio,
'publisher': publisher,
@@ -306,7 +355,8 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
# print(main('H4610-ki230225'))
# print(main('c0930-ki221218'))
# print(main('c0930-hitozuma1407'))
- print(main('h0930-ori1665'))
+ #print(main('h0930-ori1665'))
+ print(main('h0930-ori1665', appoint_url='https://7mm002.com/zh/amateur_content/107108/content.html'))
# print(main('RBD-293'))
# print(main('LUXU-728')) # 无结果
# print(main('fc2-1050737')) # 标题中有/