Skip to content

Commit

Permalink
Fix: madouqu custom website; more actor matching attempts
Browse files Browse the repository at this point in the history
  • Loading branch information
runoob11 committed Feb 1, 2024
1 parent 192c7e0 commit cfa96c8
Showing 1 changed file with 43 additions and 3 deletions.
46 changes: 43 additions & 3 deletions src/models/crawlers/madouqu.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,50 @@

from models.base.web import curl_html
from models.crawlers.guochan import get_number_list
from models.config.config import config
from models.crawlers.guochan import get_actor_list, get_lable_list

urllib3.disable_warnings() # yapf: disable


# import traceback

def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
    """Find known tags, actors, or series names inside the supplied text.

    Fallback used when a field could not be obtained directly: all the
    string arguments are concatenated and every entry of the relevant
    lookup list is tested for being a substring of that text.

    Args:
        title: Title string to search.
        file_path: File path string to search.
        info_type: Which field to recover: "tag", "actor" or "series".
        tag: Optional extra tag text to include in the search.
        actor: Optional extra actor text to include in the search.
        series: Optional extra series text to include in the search.

    Returns:
        The matching entries joined with ',' in lookup-list order with
        duplicates removed (an empty string when nothing matches), or
        None when ``info_type`` is not one of the supported values.
    """
    all_info = title + file_path + tag + actor + series

    if info_type in ("tag", "series"):
        # Tag / series not found: match them from all available text.
        # NOTE(review): "series" reads the same lookup list as "tag"
        # (get_lable_list) -- confirm this is intended.
        candidates = get_lable_list()
    elif info_type == "actor":
        # Actor not found: check whether a known actor name appears in the
        # title or any of the other text.
        candidates = get_actor_list()
    else:
        # Unsupported info_type: keep the historical "no result" value.
        return None

    # dict.fromkeys drops duplicates while keeping first-seen order; empty
    # entries are skipped so they never end up in the joined result.
    matches = dict.fromkeys(
        each for each in candidates if each and each in all_info
    )
    return ','.join(matches)

def get_actor_photo(actor):
actor = actor.split(',')
Expand All @@ -25,7 +63,7 @@ def get_actor_photo(actor):
return data


def get_detail_info(html, number):
def get_detail_info(html, number, file_path):
detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//p//text()')
# detail_info = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]//text()')
title_h1 = html.xpath('//div[@class="cao_entry_header"]/header/h1/text()')
Expand All @@ -48,6 +86,7 @@ def get_detail_info(html, number):
cover_url = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p/img/@src')
cover_url = cover_url[0] if cover_url else ''
# print(number, title, actor, cover_url, studio, detail_info)
actor = get_some_info(title, file_path, info_type="actor") if actor == '' else actor
return number, title, actor, cover_url, studio


Expand Down Expand Up @@ -75,14 +114,15 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file
log_info += ' \n 🌐 madouqu'
debug_info = ''
real_url = appoint_url
madouqu_url = getattr(config, 'madouqu_website', False)

try:
if not real_url:
# 处理番号
number_list, filename_list = get_number_list(number, appoint_number, file_path)
n_list = number_list[:1] + filename_list
for each in n_list:
real_url = f'https://madouqu.com/?s={each}'
real_url = f'{madouqu_url}/?s={each}' if madouqu_url else f'https://madouqu.com/?s={each}'
# real_url = 'https://madouqu.com/?s=XSJ-138.%E5%85%BB%E5%AD%90%E7%9A%84%E7%A7%98%E5%AF%86%E6%95%99%E5%AD%A6EP6'
debug_info = f'请求地址: {real_url} '
log_info += web_info + debug_info
Expand Down Expand Up @@ -111,7 +151,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file
raise Exception(debug_info)

detail_page = etree.fromstring(response, etree.HTMLParser())
number, title, actor, cover_url, studio = get_detail_info(detail_page, number)
number, title, actor, cover_url, studio = get_detail_info(detail_page, number, file_path)
actor_photo = get_actor_photo(actor)

try:
Expand Down

0 comments on commit cfa96c8

Please sign in to comment.