-
Notifications
You must be signed in to change notification settings - Fork 2
/
wiki.py
166 lines (128 loc) · 4.26 KB
/
wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import json
import lxml.html
import re
from requests.exceptions import HTTPError
from urllib.parse import quote, unquote
import web
r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
r_tag = re.compile(r'<(?!!)[^>]+>')
r_whitespace = re.compile(r'[\t\r\n ]+')
r_redirect = re.compile(
r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
)
abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs',
'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit',
'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
+ list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
+ list('abcdefghijklmnopqrstuvwxyz')
no_abbr = ''.join('(?<! ' + abbr + ')' for abbr in abbrs)
breaks = re.compile('({})+'.format('|'.join([
no_abbr + '[.!?](?:[ \n]|\[[0-9]+\]|$)',
'。', '。', '.', '!', '?',
])))
def format_term(term):
term = term.replace(' ', '_')
term = term[0].upper() + term[1:]
return term
def deformat_term(term):
term = term.replace('_', ' ')
return term
def format_section(section):
section = section.replace(' ', '_')
section = quote(section)
section = section.replace('%', '.')
section = section.replace(".3A", ":")
return section
def parse_term(origterm):
if "#" in origterm:
term, section = origterm.split("#")[:2]
term, section = term.strip(), section.strip()
else:
term = origterm.strip()
section = None
return (term, section)
def good_content(text, content):
if text.tag not in ['p', 'ul', 'ol']:
return False
if not content.strip():
return False
if not breaks.search(content):
return False
if text.find(".//span[@id='coordinates']") is not None:
return False
return True
def search_content(text):
if text is None:
return None
content = text.text_content()
while not good_content(text, content):
text = text.getnext()
if text is None:
return None
content = text.text_content()
return content
def extract_snippet(match, origsection=None):
html, url = match
page = lxml.html.fromstring(html)
article = page.get_element_by_id('mw-content-text')
if origsection:
section = format_section(origsection)
text = article.find(".//span[@id='{0}']".format(section))
url += "#" + unquote(section)
if text is None:
return ("No '{0}' section found.".format(origsection), url)
text = text.getparent().getnext()
content = search_content(text)
if text is None:
return ("No section text found.", url)
else:
text = article.find('./p')
if text is None:
text = article.find('./div/p')
content = search_content(text)
if text is None:
return ("No introduction text found.", url)
sentences = [x.strip() for x in breaks.split(content)]
return (sentences[0], url)
class Wiki(object):
def __init__(self, endpoints):
self.endpoints = endpoints
@staticmethod
def unescape(s):
s = s.replace('>', '>')
s = s.replace('<', '<')
s = s.replace('&', '&')
s = s.replace(' ', ' ')
s = s.replace('"', '"')
return s
@staticmethod
def text(html):
html = r_tag.sub('', html)
html = r_whitespace.sub(' ', html)
return Wiki.unescape(html).strip()
def search(self, term):
try:
exactterm = format_term(term)
exactterm = quote(exactterm)
exacturl = self.endpoints['url'].format(exactterm)
html = web.get(exacturl)
return (html, exacturl)
except HTTPError:
pass
term = deformat_term(term)
term = quote(term)
apiurl = self.endpoints['api'].format(term)
try:
result = json.loads(web.get(apiurl))
except ValueError:
return None
result = result['query']['search']
if not result:
return None
term = result[0]['title']
term = format_term(term)
term = quote(term)
url = self.endpoints['url'].format(term)
html = web.get(url)
return (html, url)