-
-
Notifications
You must be signed in to change notification settings - Fork 305
/
webnovelover.py
106 lines (87 loc) · 3.75 KB
/
webnovelover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
import logging
from urllib.parse import quote_plus

from lncrawl.core.crawler import Crawler
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Search URL template; the single ``%s`` placeholder receives the query text.
search_url = (
    "https://www.webnovelover.com/"
    "?s=%s&post_type=wp-manga&op=&author=&artist=&release=&adult="
)

# Admin AJAX endpoint that the chapter-list form is posted to.
post_chapter_url = "https://www.webnovelover.com/wp-admin/admin-ajax.php"
class WebNoveLover(Crawler):
    """Crawler for novels hosted on https://www.webnovelover.com/."""

    base_url = "https://www.webnovelover.com/"

    # Number of chapters grouped under each generated volume.
    _CHAPTERS_PER_VOLUME = 100

    # CSS selector for elements stripped from every chapter body.
    _BAD_SELECTORS = (
        "h3, .code-block, script, .adsbygoogle, .adsense-code, "
        ".sharedaddy, a, br, .dw-reactions"
    )

    # Paragraphs containing any of these phrases are dropped from the body.
    _BAD_PHRASES = (
        "*** Can’t wait until tomorrow to see more? Want to show your support? to read premium additional chapters ahead of time!",
        "[T/N Note: To Get more Free chapters Quickly Support us on",
        ". For more and better novels updates. If you have any suggestions, please give it on the comment box or contact us on",
    )

    # NOTE: Site search doesn't work. So this won't work.
    def search_novel(self, query):
        """Search the site for novels matching *query*.

        Args:
            query: Free-text search string. It is lower-cased and URL-encoded
                before being substituted into ``search_url``.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and ``info``,
            one per search result that has a title link.
        """
        # quote_plus turns spaces into "+" and escapes characters such as
        # "&" or "#" that would otherwise corrupt the query string.
        soup = self.get_soup(search_url % quote_plus(query.lower()))

        results = []
        for tab in soup.select(".c-tabs-item__content"):
            a = tab.select_one(".post-title h3 a")
            if a is None or not a.get("href"):
                # Without a title link there is nothing to point the user to.
                continue

            # Both of these are optional decorations; select_one returns
            # None when the element is absent.
            latest = tab.select_one(".latest-chap .chapter a")
            votes = tab.select_one(".rating .total_votes")
            results.append(
                {
                    "title": a.text.strip(),
                    "url": self.absolute_url(a["href"]),
                    "info": "%s | Rating: %s"
                    % (
                        latest.text if latest is not None else "N/A",
                        votes.text if votes is not None else "N/A",
                    ),
                }
            )

        return results

    def read_novel_info(self):
        """Load title, cover, author, id, volumes and chapters of the novel.

        Reads ``self.novel_url`` and fills ``self.novel_title``,
        ``self.novel_cover``, ``self.novel_author`` and ``self.novel_id``,
        then appends to ``self.volumes`` and ``self.chapters`` from the
        chapter list returned by a POST to ``post_chapter_url``.
        """
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        # Keep only the heading's direct text nodes; child tags are skipped.
        self.novel_title = " ".join(
            [str(x) for x in soup.select_one(".post-title h1").contents if not x.name]
        ).strip()
        logger.info("Novel title: %s", self.novel_title)

        # The cover is optional: prefer "data-src" and fall back to "src".
        cover_img = soup.select_one(".summary_image img")
        cover_src = None
        if cover_img is not None:
            cover_src = cover_img.get("data-src") or cover_img.get("src")
        if cover_src:
            self.novel_cover = self.absolute_url(cover_src)
        else:
            logger.warning("No cover image found on %s", self.novel_url)
        logger.info("Novel cover: %s", self.novel_cover)

        self.novel_author = " ".join(
            [
                a.text.strip()
                for a in soup.select('.author-content a[href*="novel-author"]')
            ]
        )
        logger.info("%s", self.novel_author)

        # The bookmark button carries the post id required by the AJAX call.
        self.novel_id = soup.select_one(
            ".wp-manga-action-button[data-action=bookmark]"
        )["data-post"]
        logger.info("Novel id: %s", self.novel_id)

        logger.info("Sending post request to %s", post_chapter_url)
        response = self.submit_form(
            post_chapter_url,
            data={"action": "manga_get_chapters", "manga": int(self.novel_id)},
        )
        soup = self.make_soup(response)

        # The links are walked in reverse document order
        # (presumably the site lists the newest chapter first -- confirm).
        for a in reversed(soup.select(".wp-manga-chapter > a")):
            index = len(self.chapters)  # zero-based position of this chapter
            chap_id = index + 1
            # Chapters 1-100 belong to volume 1, 101-200 to volume 2, etc.
            vol_id = index // self._CHAPTERS_PER_VOLUME + 1
            if index % self._CHAPTERS_PER_VOLUME == 0:
                # First chapter of a group: register its volume first so no
                # chapter ever references a volume that was not created.
                self.volumes.append({"id": vol_id})
            self.chapters.append(
                {
                    "id": chap_id,
                    "volume": vol_id,
                    "title": a.text.strip(),
                    "url": self.absolute_url(a["href"]),
                }
            )

    def download_chapter_body(self, chapter):
        """Download one chapter and return its cleaned contents.

        Args:
            chapter: Chapter dict; only its ``"url"`` entry is read here.

        Returns:
            Whatever ``self.cleaner.extract_contents`` produces for the
            ``div.text-left`` element after unwanted nodes are removed.
        """
        soup = self.get_soup(chapter["url"])
        contents = soup.select_one("div.text-left")

        for bad in contents.select(self._BAD_SELECTORS):
            bad.extract()

        for paragraph in contents.select("p"):
            text = paragraph.text
            # Remove the paragraph once if it contains any unwanted phrase.
            if any(phrase in text for phrase in self._BAD_PHRASES):
                paragraph.extract()

        return self.cleaner.extract_contents(contents)