def get_some_info(title, file_path, info_type, tag='', actor='', series=''):
    """Guess a missing tag/actor/series value by scanning the known text fields.

    All text fields are concatenated into one search string, and every
    candidate name that occurs in it as a substring is collected.

    Args:
        title: Title text of the entry.
        file_path: Path of the local file; it is searched as plain text.
        info_type: Which value to guess: ``"tag"``, ``"actor"`` or ``"series"``.
        tag: Already-known tag text, included in the search string.
        actor: Already-known actor text, included in the search string.
        series: Already-known series text, included in the search string.

    Returns:
        The matching candidate names joined with ``','`` in candidate order
        and without duplicates, ``''`` when nothing matches, or ``None`` when
        ``info_type`` is not one of the three supported values.
    """
    if info_type not in ('tag', 'actor', 'series'):
        # Unsupported request: keep the historical "no result" value.
        return None

    # A None field is treated as empty so it cannot raise TypeError here.
    haystack = ''.join(part or '' for part in (title, file_path, tag, actor, series))
    if not haystack:
        # Nothing to search in, so no non-empty name can match.
        return ''

    # Candidates come from the helpers imported from models.crawlers.guochan;
    # they are assumed to return iterables of strings.
    # NOTE(review): "series" is matched against get_lable_list(), the same
    # source as "tag" -- confirm this is intended.
    candidates = get_actor_list() if info_type == 'actor' else get_lable_list()

    # dict.fromkeys drops duplicates while keeping first-seen order.
    matched = dict.fromkeys(name for name in candidates if name and name in haystack)
    return ','.join(matched)
get_detail_info(html, number): cover_url = html.xpath('//div[@class="entry-content u-text-format u-clearfix"]/p/img/@src') cover_url = cover_url[0] if cover_url else '' # print(number, title, actor, cover_url, studio, detail_info) + actor = get_some_info(title, file_path, info_type="actor") if actor == '' else actor return number, title, actor, cover_url, studio @@ -75,6 +114,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file log_info += ' \n 🌐 madouqu' debug_info = '' real_url = appoint_url + madouqu_url = getattr(config, 'madouqu_website', False) try: if not real_url: @@ -82,7 +122,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file number_list, filename_list = get_number_list(number, appoint_number, file_path) n_list = number_list[:1] + filename_list for each in n_list: - real_url = f'https://madouqu.com/?s={each}' + real_url = f'{madouqu_url}/?s={each}' if madouqu_url else f'https://madouqu.com/?s={each}' # real_url = 'https://madouqu.com/?s=XSJ-138.%E5%85%BB%E5%AD%90%E7%9A%84%E7%A7%98%E5%AF%86%E6%95%99%E5%AD%A6EP6' debug_info = f'请求地址: {real_url} ' log_info += web_info + debug_info @@ -111,7 +151,7 @@ def main(number, appoint_url='', log_info='', req_web='', language='zh_cn', file raise Exception(debug_info) detail_page = etree.fromstring(response, etree.HTMLParser()) - number, title, actor, cover_url, studio = get_detail_info(detail_page, number) + number, title, actor, cover_url, studio = get_detail_info(detail_page, number, file_path) actor_photo = get_actor_photo(actor) try: