qidianunderground.py
# -*- coding: utf-8 -*-
"""
Decryptor: https://github.com/Pioverpie/privatebin-api/blob/master/privatebinapi/download.py
"""
import logging
import re
from datetime import datetime
from urllib.parse import urlsplit

import regex

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)
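# Endpoint templates, as used below: the public novel catalogue, the chapter
# index for a single novel ID, and the PrivateBin paste lookup performed
# against whatever host a chapter's "Href" points at.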
novel_list_url = "https://toc.qidianunderground.org/api/v1/pages/public"
chapter_list_url = "https://toc.qidianunderground.org/api/v1/pages/public/%s/chapters"
chapter_body_url = "/?pasteid=%s"


class QidianComCrawler(Crawler):
    base_url = [
        "https://toc.qidianunderground.org/",
        "https://tocqidianunderground.blogspot.com/",
    ]

    def initialize(self):
        self.init_executor(1)
        self.chapter_cache = {}
        self.set_header("Accept", "application/json")

    @property
    def novel_list(self):
        if not hasattr(self, "_novel_list"):
            data = self.get_json(novel_list_url)
            self._novel_list = {x["ID"]: x for x in data}
        return self._novel_list

    def search_novel(self, query):
        query = query.strip().lower()
        spaces = len(query.split(" "))
        query = regex.compile("(%s){e<=%d}" % (query, spaces))
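        # The third-party `regex` module's fuzzy-matching syntax: allow up to
        # `spaces` errors (insertions, deletions, or substitutions) when
        # looking for the query inside a title, so near-misses still match.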
        results = []
        for novel in self.novel_list.values():
            m = query.search(novel["Name"].lower())
            if m:
                last_update = datetime.fromtimestamp(novel["LastUpdated"])
                last_update = last_update.strftime("%Y-%m-%d %I:%M:%S %p")
                results.append(
                    {
                        "title": novel["Name"],
                        "url": chapter_list_url % novel["ID"],
                        "info": "Last Updated: %s" % last_update,
                        "score": sum(len(x) for x in m.groups()),
                    }
                )
        return list(sorted(results, key=lambda x: -x["score"]))[:10]

    def read_novel_info(self):
        if self.novel_url.startswith("https://tocqidianunderground"):
            soup = self.get_soup(self.novel_url)
            meta = soup.select_one('meta[property="og:title"]')
            assert meta, "No title found"
            data = self.search_novel(meta["content"])
            assert len(data) > 0, "No such novel found"
            self.novel_url = data[0]["url"]

        novel_id = self.novel_url.split("/")[-2]
        self.novel_title = self.novel_list[novel_id]["Name"]

        data = self.get_json(self.novel_url)
        for vol_id, item in enumerate(data, 1):
            if "-" in item["Text"]:
                start_ch, final_ch = re.findall(r"(\d+) - (\d+)", item["Text"])[0]
                self.volumes.append(
                    {
                        "id": vol_id,
                        "title": "Chapters %s - %s" % (start_ch, final_ch),
                    }
                )
                for j in range(int(start_ch), int(final_ch) + 1):
                    self.chapters.append(
                        {
                            "id": j,
                            "volume": vol_id,
                            "url": item["Href"],
                        }
                    )
            else:
                self.volumes.append(
                    {
                        "id": vol_id,
                        "title": "Chapters %s" % (item["Text"]),
                    }
                )
                self.chapters.append(
                    {
                        "id": int(item["Text"]),
                        "volume": vol_id,
                        "url": item["Href"],
                    }
                )

    def download_chapter_body(self, chapter):
        from lncrawl.utils.pbincli import PasteV2
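        # Each chapter "Href" is a PrivateBin paste link of the form
        # <host>/?<paste id>#<passphrase>: the query string identifies the
        # paste and the URL fragment carries the key needed to decrypt it.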
        url_data = urlsplit(chapter["url"])
        pasteHost = url_data.scheme + "://" + url_data.netloc
        pasteId = url_data.query
        passphrase = url_data.fragment
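        # A single paste usually holds a whole range of chapters (in
        # read_novel_info, every chapter in a "Text" range shares one "Href"),
        # so the decrypted soup is cached per paste id and reused.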
        if pasteId in self.chapter_cache:
            soup = self.chapter_cache[pasteId]
        else:
            data = self.get_json(pasteHost + (chapter_body_url % pasteId))
            paste = PasteV2()
            paste.setHash(passphrase)
            paste.loadJSON(data)
            paste.decrypt()
            soup = self.make_soup(paste.getText())
            self.chapter_cache[pasteId] = soup

        a = soup.select_one('#toc a[href*="chapter-%d"]' % chapter["id"])
        chapter["title"] = a.text.strip()
        logger.debug(chapter["title"])
        logger.debug("Chapter Id: %s", a["href"])

        contents = soup.find("div", attrs={"id": a["href"][1:]})
        contents = contents.find("div", attrs={"class": "well"})
        for bad in contents.select("h2, br"):
            bad.extract()

        body = contents.text.split("\n\n")
        return "<p>" + "</p><p>".join(body) + "</p>"