-
-
Notifications
You must be signed in to change notification settings - Fork 305
/
creativenovels.py
140 lines (118 loc) · 4.71 KB
/
creativenovels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
import logging
import re
from urllib.parse import parse_qs, urlparse
from bs4 import Tag
from lncrawl.core.crawler import Crawler
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# WordPress admin-ajax endpoint that the chapter-list form is submitted to.
chapter_list_url = "https://creativenovels.com/wp-admin/admin-ajax.php"

# Pattern whose single capture group is the "security" value of the inline
# ``chapter_list_summon`` script variable. The literal is split across
# adjacent raw strings purely for line length; the resulting value is one
# uninterrupted pattern.
chapter_s_regex = (
    r'var chapter_list_summon = {"ajaxurl":'
    r'"https:\/\/creativenovels.com\/wp-admin\/admin-ajax.php",'
    r'"security":"([^"]+)"}'
)
class CreativeNovelsCrawler(Crawler):
    """Crawler for novels hosted on creativenovels.com."""

    base_url = "https://creativenovels.com/"

    def initialize(self) -> None:
        """Add site-specific CSS selectors to the cleaner's ``bad_css`` collection."""
        self.cleaner.bad_css.update(
            [
                ".announcements_crn",
                'span[style*="color:transparent"]',
                "div.novel_showcase",
            ]
        )

    def read_novel_info(self):
        """Populate the novel id, title, cover, author, volumes and chapters.

        Fetches ``self.novel_url`` and reads the metadata from that page, then
        submits a form to ``chapter_list_url`` to obtain the chapter list.
        Chapters are numbered in the order received and grouped into volumes
        of 100. If the response body does not start with ``"success"`` the
        method returns without adding any chapter.

        Raises:
            AssertionError: if the page has no shortlink ``<link>`` element
                or no ``<title>`` element.
        """
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        # The novel id is the ``p`` query parameter of the shortlink URL.
        shortlink = soup.find("link", {"rel": "shortlink"})
        assert isinstance(shortlink, Tag), "No shortlink"
        self.novel_id = parse_qs(urlparse(shortlink["href"]).query)["p"][0]
        logger.info("Id: %s", self.novel_id)

        possible_title = soup.select_one("head title")
        assert possible_title, "No novel title"
        # Keep only the part of the page title before the first en dash.
        self.novel_title = possible_title.text.split("–")[0].strip()
        logger.info("Novel title: %s", self.novel_title)

        possible_image = soup.select_one("img.book_cover")
        if possible_image:
            self.novel_cover = self.absolute_url(possible_image["src"])
        logger.info("Novel Cover: %s", self.novel_cover)

        # The first matching text block mentioning an author or translator
        # is used verbatim as the author string.
        for div in soup.select(".x-bar-content .x-text.bK_C"):
            text = div.text.strip()
            if re.search("author|translator", text, re.I):
                self.novel_author = text
                break
        logger.info(self.novel_author)

        list_security_key = self._find_list_security_key(soup)
        logger.debug("Chapter list security = %s", list_security_key)

        response = self.submit_form(
            chapter_list_url,
            data=dict(
                action="crn_chapter_list", view_id=self.novel_id, s=list_security_key
            ),
        )
        content = response.content.decode("utf8")
        if not content.startswith("success"):
            logger.warning("Chapter list request was not successful")
            return

        content = content[len("success.define.") :]
        # Records are separated by ".end_data."; within a record the URL and
        # the title are separated by ".data.".
        for data in content.split(".end_data."):
            parts = data.split(".data.")
            if len(parts) < 2:
                continue
            ch_id = len(self.chapters) + 1
            vol_id = 1 + len(self.chapters) // 100
            if vol_id > len(self.volumes):
                self.volumes.append({"id": vol_id})
            self.chapters.append(
                {
                    "id": ch_id,
                    "volume": vol_id,
                    "url": parts[0],
                    "title": parts[1],
                }
            )

    @staticmethod
    def _find_list_security_key(soup) -> str:
        """Return the chapter-list security value found in the page scripts.

        Looks at every ``<script>`` whose text contains
        ``var chapter_list_summon`` and inspects its first four double-quoted
        strings. A script matches when the first three are ``ajaxurl``, the
        escaped admin-ajax URL and ``security``; the fourth is then the key.
        When several scripts match, the last one wins. Returns an empty
        string when no script matches.
        """
        expected_head = [
            "ajaxurl",
            "https:\\/\\/creativenovels.com\\/wp-admin\\/admin-ajax.php",
            "security",
        ]
        key = ""
        for script in soup.select("script"):
            text = script.string
            if not text or "var chapter_list_summon" not in text:
                continue
            quoted = re.findall(r'"([^"]+)"', text)
            # Fewer than four quoted strings cannot hold the key; skip the
            # script instead of failing with IndexError.
            if len(quoted) < 4:
                continue
            if quoted[:3] == expected_head:
                key = quoted[3]
        return key

    def download_chapter_body(self, chapter):
        """Download one chapter and return its cleaned content.

        Args:
            chapter: mapping whose ``"url"`` entry is the chapter page URL.

        Returns:
            Whatever ``self.cleaner.extract_contents`` returns for the
            ``article .entry-content`` element after the spans inside it
            have been normalised.

        Raises:
            AssertionError: if the page has no ``article .entry-content``
                element.
        """
        soup = self.get_soup(chapter["url"])

        formatting_tags = frozenset(
            ["b", "i", "strong", "small", "em", "mark", "ins", "sub", "sup", "br"]
        )

        body = soup.select_one("article .entry-content")
        assert isinstance(body, Tag)

        for span in body.find_all("span"):
            parent = span.parent
            # A span is unwrapped to plain text when its parent has at most
            # three children and either the parent is a formatting tag or the
            # span has a sibling. Earlier code also compared a sibling's tag
            # name to the whole tag list, which could never be true; those
            # checks did not affect the outcome and are gone.
            unwrap = len(parent.contents) <= 3 and (
                parent.name in formatting_tags
                or span.next_sibling
                or span.previous_sibling
            )
            if unwrap:
                span.replace_with(span.text)
            else:
                # Every other span becomes an attribute-free paragraph.
                span.name = "p"
                span.attrs = {}

        return self.cleaner.extract_contents(body)