This repository has been archived by the owner on Apr 6, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
document.py
168 lines (138 loc) · 4.63 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
from urllib.error import HTTPError
from collections import namedtuple
from util import pq, List, Text, OrderedSet
# Base set of heading ids to leave out. Kept as its own name because it is
# spliced into ``scrape_excludes`` below (other users, if any, are not
# visible from this file).
_excludes = (
    'Recommended_Readings',
    'See_Also',
    'Residents',
    'Alternate_Reality_Versions',
)

# Heading ids skipped while scraping. The string form of this List is
# interpolated straight into the ``h2`` selector in ``Document.__iter__``.
# NOTE(review): presumably ``format`` is applied to every item and ``str`` is
# the join separator, yielding ':not(#A):not(#B)...' -- confirm in util.List.
scrape_excludes = List(
    list(_excludes) + [
        'Links_and_References',
        'References',
        'Points_of_Interest',
        'Links',
        'Related',
    ],
    format=':not(#{item})',
    str='',
)
# Locates a fragment's content: ``pointer`` is the element the fragment starts
# from and ``until`` is the selector that ends it (may be None).
Handle = namedtuple('Handle', 'pointer until')

# One index entry: its ``handle`` plus the extracted ``text`` (None until the
# text has been extracted).
Fragment = namedtuple('Fragment', 'handle text')
def _text(el, to_strip=None):
if el is None:
return None
text = el.text() if isinstance(el, pq) else el
return text.strip().strip(to_strip).strip()
class Document:
    """A fetched wiki page exposed as an index of named text fragments.

    Iterating a document yields fragment names of the form
    ``<document name>#<heading>`` or ``<document name>#<heading>/<subheading>``
    (plus ``<document name>#Summary`` for the infobox entry). Indexing the
    document with one of those names returns the fragment's text, which is
    extracted on first access and cached afterwards.
    """

    def __init__(self, url=None, name=None, quotes=False, prepare=False):
        """Fetch the page and set up the (still empty) fragment index.

        Args:
            url: With *name*, a template whose ``format(*name.split('|'))``
                produces the page URL. Without *name*, an object exposing
                ``subdomain`` and ``basename`` whose ``str()`` is the URL.
            name: Document name with its parts separated by ``|``.
            quotes: When true, ``.quote`` elements are rewritten and included
                in the extracted text.
            prepare: When true, build the fragment index immediately instead
                of on first iteration or lookup.

        An ``HTTPError`` while fetching is swallowed on purpose: the document
        is then backed by an empty ``pq`` object.
        """
        if name is not None:
            url = url.format(*name.split('|'))
            self.name = name
        else:
            self.name = '|'.join([url.subdomain, url.basename])
        self.url = str(url)

        try:
            doc = pq(url=self.url)
        except HTTPError:
            doc = pq([])

        self.caption = doc.children('head > title').text().split('|', 1)[0].strip()

        # Drop a trailing "(en)" marker. ``str.rstrip('(en)')`` must not be
        # used here: it strips any trailing '(', 'e', 'n', ')' characters and
        # would turn "Marvel Database" into "Marvel Databas".
        site = doc.find('link[rel="search"]').default('title', '').strip()
        if site.endswith('(en)'):
            site = site[:-len('(en)')].strip()
        self.site = site

        self._doc = doc
        self.__content = None   # cleaned content container, see _content()
        self.__h2 = None        # not read anywhere in this class
        self._fragments = None  # name -> Fragment; None until _prepare() runs
        self._refs = True       # True while '.reference' nodes are still present
        self._quotes = quotes

        # Inline-level selector; matching nodes are glued together with spaces
        # in _create(), everything else starts a new line.
        self._isel = 'text, a, b, i, em, strong, span'
        sel = List([self._isel, 'p, ul, ol'])
        if self._quotes:
            sel.append('.quote')
        self._sel = str(sel)

        if prepare:
            # ``iter(self)`` would only create a generator object without
            # running it, so the index is built explicitly.
            self._prepare()

    def __bool__(self):
        """Return the truthiness of the underlying parsed document."""
        return bool(self._doc)

    def _content(self):
        """Return the page's content container, cleaned once and then cached.

        NOTE(review): '.mw-content-text' is a class selector; confirm the
        target pages expose it as a class rather than an id.
        """
        if self.__content is None:
            content = self._doc.find('.mw-content-text')
            content.find('.noprint, noscript, script, style, link, iframe, embed, video, img, .editsection').remove()
            self.__content = content
        return self.__content

    def _prepare(self):
        """Build the fragment index; does nothing if it already exists."""
        if self._fragments is not None:
            return
        self._fragments = {}
        if not self:
            # Falsy document (e.g. the fetch failed): keep the index empty.
            return

        content = self._content()
        self._fragment(content.children('.portable-infobox'), name='Summary', until='#toc')
        h2_list = content.children(f'h2{scrape_excludes} > {scrape_excludes}').closest('h2')
        for h2 in h2_list.items():
            self._fragment(h2)
            for h3 in h2.nextUntil('h2', 'h3').items():
                self._fragment(h2, h3)

    def __iter__(self):
        """Yield the names of all indexed fragments.

        The index is built on first use and reused afterwards. A falsy
        document simply yields nothing (a generator must ``return`` rather
        than ``raise StopIteration``, which PEP 479 turns into RuntimeError).
        """
        self._prepare()
        # Iterate over a snapshot: looking up a fragment while iterating may
        # delete entries from the index (see _create).
        yield from tuple(self._fragments)

    def _fragment(self, *pointer, name=None, until=None):
        """Register an index entry for the heading path *pointer*.

        Args:
            *pointer: Heading elements from outermost to innermost; the last
                one is stored as the fragment's starting element.
            name: Explicit fragment title; defaults to the heading texts
                joined with ``/``.
            until: Selector that ends the fragment, or None for the default
                applied in ``__getitem__``.
        """
        if not name:
            name = '/'.join(_text(h) for h in pointer)
        name = f"{self.name}#{_text(name)}"
        fragment = Fragment(Handle(pointer[-1], until), None)
        self._fragments[name] = fragment

    def __getitem__(self, name):
        """Return the text of fragment *name*, or None if there is none.

        The index is built on demand, so a lookup before any iteration works.
        Text is extracted on first access and cached; fragments that turn out
        to be empty are removed from the index and yield None.
        """
        self._prepare()
        if name not in self._fragments:
            return None

        handle, text = self._fragments[name]
        if text is not None:
            return text

        content = self._content()
        if self._refs:
            # One-off removal of '.reference' nodes.
            content.find('.reference').remove()
            self._refs = False
        if self._quotes:
            # One-off rewrite of quote blocks: normalise the quotation marks
            # around the quoted text and move the author (followed by "said")
            # in front of it.
            for quote in content.find('.quote').items():
                author = quote.find('.selflink').closest('b')
                author.closest('dl').remove()
                _quote = quote.find('i')
                _quote.text('"' + _text(_quote, '"\'') + '"')
                author.append('said').prependTo(_quote.closest('dd'))
            self._quotes = None

        pointer, until = handle
        if not until:
            until = 'h2, h3'

        if pointer.children('span').is_('#Abilities, #Equipment, #Transportation, #Weapons'):
            # List-style sections: every <li> becomes its own paragraph.
            body = pq([])
            for li in pointer.nextUntil('h2, h3', 'ul').children('li').items():
                nodes = pq([])
                for node in li.contents().items(exclude='ul, b > a'):
                    # Text directly after a bold link loses its leading ': '.
                    nodes.extend(Text(_text(node).lstrip(': ').rstrip() + ' ') if node.prev().is_('b > a') else node)
                body.extend(pq('<p>').append(nodes))
        else:
            body = pointer.nextUntil(until, self._sel)

        return self._create(name, body)

    def _create(self, name, body):
        """Render *body* to text, cache it under *name* and return it.

        Consecutive inline nodes are joined with spaces; any other node
        flushes the pending inline run and goes on its own line. If there is
        no body or the resulting text is empty, the entry is deleted from the
        index and None is returned.
        """
        fragment = self._fragments.get(name)
        if not body:
            del self._fragments[name]
            return None

        text = List(banned=False, str='\n')
        span = List(banned=False, str=' ')
        for node in body.items():
            if node.is_(self._isel):
                span.append(_text(node))
            else:
                text.extend([str(span), _text(node)])
                span.clear()
        text.append(str(span))

        text_ = str(text)
        if not text_:
            del self._fragments[name]
            return None

        self._fragments[name] = Fragment(fragment.handle, text_)
        return text_

    @staticmethod
    def parse_name(name):
        """Split a fragment name into ``(document name, [heading, ...])``.

        Only the first ``#`` separates the document name from the heading
        path, so heading texts may themselves contain ``#``.

        Raises:
            ValueError: If *name* contains no ``#`` at all.
        """
        name, heads = name.split('#', 1)
        return name, heads.split('/')