Commit

Fix: mmtv custom website; try more actor matching rules
runoob11 committed Feb 1, 2024
1 parent fc88133 commit e38989d
Showing 2 changed files with 51 additions and 7 deletions.
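Summary: the 7mmtv crawler now honours a user-configured site address (falling back to https://www.7mmtv.sx) when building its search URL, and when a detail page lists no performers it tries to match known actor names from models.crawlers.guochan against the video title and the local file path, which is why crawler.py now passes file_path through.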
2 changes: 1 addition & 1 deletion src/models/core/crawler.py
@@ -124,7 +124,7 @@ def _call_crawler(json_data, website, language, file_number, short_number, mosai
    elif website == 'mgstage':
        json_data = json.loads(mgstage.main(file_number, appoint_url, log_info, req_web, language, short_number))
    elif website == '7mmtv':
-        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language))
+        json_data = json.loads(mmtv.main(file_number, appoint_url, log_info, req_web, language, file_path))
    elif website == 'fc2':
        json_data = json.loads(fc2.main(file_number, appoint_url, log_info, req_web, language))
    elif website == 'fc2hub':
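On the caller side the only change is that the 7mmtv branch forwards file_path, so the crawler can fall back to the filename when the page itself names no performers. A minimal sketch of a direct call with the new argument, using made-up values and assuming the models.crawlers package is importable as in crawler.py above:

    import json

    from models.crawlers import mmtv

    # Hypothetical number and local file path, for illustration only.
    file_number = 'FC2-1234567'
    file_path = '/downloads/FC2-1234567 愛澄玲花.mp4'

    # Same positional order as the crawler.py call above: number, appoint_url, log_info, req_web, language, file_path.
    json_data = json.loads(mmtv.main(file_number, '', '', '', 'zh_cn', file_path))
    print(json_data)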
56 changes: 50 additions & 6 deletions src/models/crawlers/mmtv.py
@@ -9,6 +9,8 @@

from models.base.number import is_uncensored
from models.base.web import curl_html
+from models.config.config import config
+from models.crawlers.guochan import get_actor_list, get_lable_list

urllib3.disable_warnings() # yapf: disable

@@ -30,15 +32,53 @@ def get_title(html, web_number):
    return result[0].replace(web_number, '').strip() if result else ''


-def get_actor(html):
+def get_actor(html, title, file_path):
    actor_list = html.xpath('//div[@class="fullvideo-idol"]/span/a/text()')
    actor = ''
    if actor_list:
        for each in actor_list:
            # e.g. '愛澄玲花,日高ゆりあ(青山ひより) 菜津子 32歳 デザイナー': drop the parenthesised alias, keep the text before the first space
            actor += re.sub(r'(.+)', '', each).split(' ')[0] + ','
+    else:
+        actor = get_some_info(title, file_path, info_type="actor")
    return actor.strip(',')

+def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
+
+    all_info = title + file_path + tag + actor + series
+
+    # No tag found on the page: match known tags against the combined info
+    if info_type == "tag":
+        tag_list = []
+        all_tag = get_lable_list()
+        for each in all_tag:
+            if each in all_info:
+                tag_list.append(each)
+        new_tag_list = []
+        [new_tag_list.append(i) for i in tag_list if i and i not in new_tag_list]  # dedupe, keep order
+        return ','.join(new_tag_list)
+
+    # No actor found on the page: check whether a known actor appears in the title or other info
+    if info_type == "actor":
+        actor_list = []
+        all_actor = get_actor_list()
+        for each in all_actor:
+            if each in all_info:
+                actor_list.append(each)
+        new_actor_list = []
+        [new_actor_list.append(i) for i in actor_list if i and i not in new_actor_list]
+        return ','.join(new_actor_list)
+
+    # No series found on the page: match known series names against the combined info
+    if info_type == "series":
+        series_list = []
+        all_series = get_lable_list()
+        for each in all_series:
+            if each in all_info:
+                series_list.append(each)
+        new_series_list = []
+        [new_series_list.append(i) for i in series_list if i and i not in new_series_list]
+        return ','.join(new_series_list)

def get_real_url(html, number):
    result = html.xpath('//figure[@class="video-preview"]/a')
@@ -134,7 +174,7 @@ def get_tag(html):


def get_extrafanart(html):
-    result = html.xpath('//a[@class="screens-item fresco"]/@href')
+    result = html.xpath('//a[@class="lazyload screens-item fresco"]/@href')
    return result if result else ''


@@ -166,7 +206,7 @@ def get_number(html, number):
    return number.replace('FC2-PPV ', 'FC2-'), release, runtime, number


-def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
+def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file_path=''):
    start_time = time.time()
    website_name = '7mmtv'
    req_web += '-> %s' % website_name
@@ -175,9 +215,13 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
    web_info = '\n '
    log_info += ' \n 🌐 7mmtv'
    debug_info = ''
+    mmtv_url = 'https://www.7mmtv.sx'
+    if hasattr(config, '7mmtv_website'):
+        mmtv_url = getattr(config, '7mmtv_website')
    real_url = appoint_url
    # search_url = "https://bb9711.com/zh/searchform_search/all/index.html"
-    search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+    # search_url = "https://7mmtv.sx/zh/searchform_search/all/index.html"
+    search_url = f"{mmtv_url}/zh/searchform_search/all/index.html"
    mosaic = ''

    try:
@@ -186,7 +230,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
        if number.upper().startswith('FC2'):
            search_keyword = re.findall(r'\d{3,}', number)[0]

-        search_url = f'https://7mmtv.sx/zh/searchform_search/all/index.html?search_keyword={search_keyword}&search_type=searchall&op=search'
+        search_url = f'{search_url}?search_keyword={search_keyword}&search_type=searchall&op=search'
        debug_info = f'搜索地址: {search_url} '
        log_info += web_info + debug_info
        result, response = curl_html(search_url)
@@ -220,7 +264,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn'):
            debug_info = '数据获取失败: 未获取到title!'
            log_info += web_info + debug_info
            raise Exception(debug_info)
-        actor = get_actor(html_info)
+        actor = get_actor(html_info, title, file_path)
        actor_photo = get_actor_photo(actor)
        cover_url = get_cover(html_content)
        outline, originalplot = get_outline(html_info)
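Two things to note about the new code above. First, main() now reads an optional 7mmtv_website attribute from the config object via hasattr/getattr (the name starts with a digit, so normal attribute syntax would not parse) and builds the search URL from that base instead of the hard-coded https://7mmtv.sx. Second, get_some_info is a plain substring scan: it concatenates the title, the file path and any already-known fields, keeps every known actor/tag/series name that occurs in that blob, and de-duplicates while preserving first-seen order. A standalone sketch of that matching idea, where match_known_names is a hypothetical helper and the made-up name list stands in for guochan.get_actor_list():

    def match_known_names(known_names, *info_parts):
        # Join everything known about the file into one string, then keep the names
        # that appear in it, first-seen order, no duplicates (mirrors get_some_info).
        all_info = ''.join(info_parts)
        matched = []
        for name in known_names:
            if name and name in all_info and name not in matched:
                matched.append(name)
        return ','.join(matched)


    # Hypothetical title and path; the names reuse the example string from get_actor above.
    title = '愛澄玲花の新作'
    file_path = '/downloads/7mmtv/愛澄玲花.mp4'
    print(match_known_names(['愛澄玲花', '日高ゆりあ'], title, file_path))  # -> 愛澄玲花

Because the scan is substring-based, a longer name that contains a shorter one will match both; how often that happens depends on the data returned by get_actor_list() and get_lable_list().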
