webnovel.py
# -*- coding: utf-8 -*-
import logging
import re
from time import time
from urllib.parse import urlencode, urlparse

from bs4 import BeautifulSoup

from lncrawl.core.exeptions import FallbackToBrowser
from lncrawl.models import Chapter, SearchResult
from lncrawl.models.volume import Volume
from lncrawl.templates.browser.basic import BasicBrowserTemplate
from lncrawl.webdriver.elements import By

logger = logging.getLogger(__name__)


class WebnovelCrawler(BasicBrowserTemplate):
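    """Crawler for webnovel.com: uses the plain HTTP scraper path where
    possible and falls back to a webdriver-backed browser session
    (see FallbackToBrowser) when it cannot."""
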
    base_url = [
        "https://m.webnovel.com/",
        "https://www.webnovel.com/",
    ]

    def initialize(self) -> None:
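        """Force headless mode, pin the home URL to the desktop site, and
        compile a regex that strips anti-piracy watermark text from chapters."""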
        self.headless = True
        self.home_url = "https://www.webnovel.com/"
        bad_text = [
            r"(\<pirate\>(.*?)\<\/pirate\>)",
            r"(Find authorized novels in Webnovel(.*)for visiting\.)",
        ]
        self.re_cleaner = re.compile("|".join(bad_text), re.M)

    def get_csrf(self):
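        """Load the home page so the server sets the _csrfToken cookie, then
        cache its value for later API calls."""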
logger.info("Getting CSRF Token")
self.get_response(self.home_url)
self.csrf = self.cookies["_csrfToken"]
logger.debug("CSRF Token = %s", self.csrf)

    def search_novel_in_scraper(self, query: str):
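        """Query the go/pcm/search/result JSON API and yield one SearchResult
        per book item in the response."""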
        self.get_csrf()
        params = {
            "_csrfToken": self.csrf,
            "pageIndex": 1,
            "encryptType": 3,
            "_fsae": 0,
            "keywords": query,
        }
        data = self.get_json(f"{self.home_url}go/pcm/search/result?{urlencode(params)}")
        for book in data["data"]["bookInfo"]["bookItems"]:
            yield SearchResult(
                title=book["bookName"],
                url=f"{self.home_url}book/{book['bookId']}",
                info="%(categoryName)s | Score: %(totalScore)s" % book,
            )

    def search_novel_in_browser(self, query: str):
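        """Drive the site's search page in the browser and scrape the rendered
        result list."""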
params = {"keywords": query}
self.visit(f"{self.home_url}search?{urlencode(params)}")
self.last_soup_url = self.browser.current_url
for li in self.browser.soup.select(".search-result-container li"):
a = li.find("a")
yield SearchResult(
url=self.absolute_url(a.get("href")),
title=a.get("data-bookname"),
info=li.find(".g_star_num small").text.strip(),
)

    def read_novel_info_in_scraper(self):
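        """Resolve the book id from the novel URL, pull title, cover, and author
        from the getContent API, then parse the chapter catalog page. Raises
        FallbackToBrowser when no chapters could be extracted."""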
        self.get_csrf()
        url = self.novel_url
        if "_" not in url:
            ids = re.findall(r"/book/(\d+)", url)
            assert ids, "Please enter a correct novel URL"
            self.novel_id = ids[0]
        else:
            self.novel_id = url.split("_")[1]
        logger.info("Novel Id: %s", self.novel_id)

        response = self.get_response(
            f"{self.home_url}go/pcm/chapter/getContent"
            + f"?_csrfToken={self.csrf}&bookId={self.novel_id}&chapterId=0"
            + "&encryptType=3&_fsae=0"
        )
        data = response.json()
        logger.debug("Book Response:\n%s", data)
        assert "data" in data, "Data not found"
        data = data["data"]

        assert "bookInfo" in data, "Book info not found"
        book_info = data["bookInfo"]

        assert "bookName" in book_info, "Book name not found"
        self.novel_title = book_info["bookName"]

        self.novel_cover = (
            f"{self.origin.scheme}://img.webnovel.com/bookcover/{self.novel_id}/600/600.jpg"
            + f"?coverUpdateTime={int(1000 * time())}&imageMogr2/quality/40"
        )

        if "authorName" in book_info:
            self.novel_author = book_info["authorName"]
        elif "authorItems" in book_info:
            self.novel_author = ", ".join(
                [x.get("name") for x in book_info["authorItems"] if x.get("name")]
            )

        # To get the chapter list catalog
        soup = self.get_soup(f"{self.novel_url.strip('/')}/catalog")
        self.parse_chapter_catalog(soup)
        if not self.chapters:
            raise FallbackToBrowser()

    def read_novel_info_in_browser(self) -> None:
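        """Open the /catalog page in the browser, wait for the chapter list to
        render, and parse it."""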
        path = urlparse(self.novel_url).path.strip("/")
        self.visit(f"{self.home_url}{path}/catalog")
        self.last_soup_url = self.browser.current_url
        self.browser.wait(".j_catalog_list")
        self.parse_chapter_catalog(self.browser.soup)

    def parse_chapter_catalog(self, soup: BeautifulSoup) -> None:
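        """Build Volume and Chapter entries from the catalog markup, numbering
        them sequentially as they appear."""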
        for div in soup.select(".j_catalog_list .volume-item"):
            vol = Volume(
                id=len(self.volumes) + 1,
                title=div.find("h4").text.strip(),
            )
            self.volumes.append(vol)
            for li in div.select("li"):
                a = li.find("a")
                chap = Chapter(
                    id=len(self.chapters) + 1,
                    volume=vol.id,
                    title=a.get("title"),
                    cid=li.get("data-report-cid"),
                    url=self.absolute_url(a.get("href")),
                )
                self.chapters.append(chap)

    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
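        """Visit the chapter page in the browser and return its paragraphs as
        an HTML string."""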
        path = urlparse(chapter.url).path.strip("/")
        self.visit(f"{self.home_url}{path}")
        self.browser.wait(f"j_chapter_{chapter.cid}", By.CLASS_NAME)
        body = ""
        for p in self.browser.soup.select(f".j_chapter_{chapter.cid} .cha-paragraph p"):
            body += str(p)
        return body

    def download_chapter_body_in_scraper(self, chapter: Chapter) -> str:
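        """Fetch the chapter text from the getContent API, handling both the
        single "content" field and the segmented "contents" list."""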
logger.info("Chapter Id: %s", chapter.cid)
response = self.get_response(
f"{self.home_url}go/pcm/chapter/getContent?encryptType=3&_fsae=0"
+ f"&_csrfToken={self.csrf}&bookId={self.novel_id}&chapterId={chapter.cid}"
)
data = response.json()
logger.debug("Chapter Response:\n%s", data)
assert "data" in data, "Data not found"
data = data["data"]
assert "chapterInfo" in data, "Chapter Info not found"
chapter_info = data["chapterInfo"]
chapter.title = chapter_info["chapterName"] or f"Chapter #{chapter.id}"
if "content" in chapter_info:
return self._format_content(chapter_info["content"])
if "contents" in chapter_info:
body = [
self._format_content(x["content"])
for x in chapter_info["contents"]
if "content" in x
]
return "".join([x for x in body if x.strip()])

    def _format_content(self, text: str):
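        """Wrap raw chapter text into <p> paragraphs when it is not already
        HTML, then strip watermark phrases with re_cleaner."""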
if ("<p>" not in text) or ("</p>" not in text):
text = "".join(text.split("\r"))
text = "<".join(text.split("<"))
text = ">".join(text.split(">"))
text = [x.strip() for x in text.split("\n") if x.strip()]
text = "<p>" + "</p><p>".join(text) + "</p>"
text = self.re_cleaner.sub("", text)
return text.strip()