-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsearticles.py
executable file
·195 lines (149 loc) · 5.39 KB
/
parsearticles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python
'''
http://www.w3schools.com/xpath/xpath_axes.asp
parse articles,
rewrite urls
rewrite image links
remove links
rewrite links
add link classes
'''
import sys, os
from lxml import etree
import layout
def _add_class(element, cname):
    '''Add *cname* to the element's ``class`` attribute unless already present.

    The attribute is split on any whitespace, so tab or newline separated
    class lists are handled as well; the result is re-joined with single
    spaces.
    '''
    classes = element.get('class', '').split()
    if cname not in classes:
        classes.append(cname)
    element.set('class', ' '.join(classes))


def rewrite_external_urls(troot, article_names):
    '''Rewrite the href of every <a> below *troot* and tag it with link classes.

    troot         -- root element of the parsed article (queried via xpath)
    article_names -- container of locally available article names; only
                     membership tests (``in``) are performed on it

    The href prefix decides what happens (checked in this order):

    Fragment link ("#..."):
        left untouched.
    External wiki / Wikimedia project ("//...", classes: extiw external):
        //wikimediafoundation.org/wiki/Terms_of_use?useformat=mobile
        "http:" is prepended.
    Wiki link ("/wiki/NAME"):
        NAME in article_names (class: localwiki):
            /wiki/Germany  ->  href becomes NAME
        otherwise (classes: remotewiki external):
            /wiki/BuxtehudeNotLocal
            /wiki/Wikipedia:General_disclaimer
            href becomes layout.external_wiki_page_link(NAME)
    External link ("http...", class: external):
        http://www.citymayors.com/features/quality_survey.html
        http://en.m.wikipedia.org/w/index.php?title=Munich&action=history
        href is kept.
    Special link ("/w/...", classes: extiw external):
        /w/index.php?title=Special:MobileFeedback&returnto=Munich&feedbacksource=MobileFrontend
        href becomes layout.external_wiki_link(url)
    FTP link ("ftp://..."):
        left untouched.
    Anything else is reported on stdout as "unknown" and left untouched.

    Anchors without an href attribute are skipped.

    Author notes that are NOT implemented here:
        links to remove, e.g.
            http://en.wikipedia.org/w/index.php?title=Munich&mobileaction=toggle_view_desktop
        rules for: mw-redirect, image, internal, button
    '''
    for a in troot.xpath('.//a'):
        url = a.get('href')
        if url is None:
            # No href attribute (e.g. a named anchor): nothing to rewrite.
            continue
        if url.startswith('#'):
            # in-page fragment link
            continue
        elif url.startswith('//'):
            # Wikimedia project (protocol-relative URL)
            a.set('href', 'http:' + url)
            _add_class(a, 'extiw')
            _add_class(a, 'external')
        elif url.startswith('/wiki/'):
            # Wikilink; strip the leading "/wiki/".
            name = url[6:]  # FIXME: name is still URL-encoded here
            if name in article_names:
                a.set('href', name)
                _add_class(a, 'localwiki')
            else:
                a.set('href', layout.external_wiki_page_link(name))
                _add_class(a, 'remotewiki')
                _add_class(a, 'external')
        elif url.startswith('http'):
            # external link
            _add_class(a, 'external')
        elif url.startswith('/w/'):
            # special link
            a.set('href', layout.external_wiki_link(url))
            _add_class(a, 'extiw')
            _add_class(a, 'external')
        elif url.startswith('ftp://'):
            continue
        else:
            # Single-argument print() gives the same output on Python 2 and 3.
            print('unknown %s %s' % (url, a.get('class')))
def replace_image_src(troot):
    '''Rewrite the ``src`` of every <img> below *troot* to its local URL.

    Each ``src`` value is passed through layout.ext_img_url2local_url() and
    the result is written back to the element.  Images without a ``src``
    attribute are skipped.  If the conversion raises UnicodeEncodeError the
    offending value is reported on stdout and the image is left unchanged.
    '''
    # KISS
    for img in troot.xpath('.//img'):
        src = img.get('src')
        if src is None:
            # Nothing to convert; avoids handing None to the layout helper.
            continue
        try:
            local_src = layout.ext_img_url2local_url(src)
        except UnicodeEncodeError:
            # Single-argument print() gives the same output on Python 2 and 3.
            print('UnicodeERR %s' % repr(src))
        else:
            img.set('src', local_src)
def rewrite_styles(troot):
    '''Point every stylesheet <link> below *troot* at layout.stylesheet_url(n).

    n is the zero-based position of the link among all
    ``<link rel="stylesheet">`` elements, in document order.
    '''
    stylesheet_links = troot.xpath('.//link[@rel="stylesheet"]')
    for position, link in enumerate(stylesheet_links):
        new_href = layout.stylesheet_url(position)
        link.set('href', new_href)
def rewrite_scripts(troot):
    '''Rewrite the ``href`` of every <script> below *troot* that has one.

    A script with a non-empty ``href`` gets it replaced by
    layout.script_url(n), where n is the script's zero-based position among
    all <script> elements.  The original URLs are collected and, if there
    are any, printed as a list on stdout.

    NOTE(review): <script> elements normally reference their source via
    ``src``, not ``href``, which is probably why no scripts were ever seen
    in the downloads.  Behaviour is deliberately left unchanged here;
    confirm layout.script_url() and the downloader before switching the
    attribute.
    '''
    original_urls = []
    for num, script in enumerate(troot.xpath('.//script')):
        url = script.get('href')
        if url:
            original_urls.append(url)
            script.set('href', layout.script_url(num))
    if original_urls:
        # print() with one argument behaves the same on Python 2 and 3.
        print(original_urls)
def _remove_keeping_tail(element):
    '''Detach *element* from its parent without losing the text after it.

    lxml stores the text that follows an element in ``element.tail`` and
    drops it together with the element on ``parent.remove()``.  The tail is
    therefore moved to the previous sibling's tail, or to the parent's text
    when there is no previous sibling, before the element is removed.
    '''
    parent = element.getparent()
    if parent is None:
        return
    tail = element.tail
    if tail:
        previous = element.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(element)


def remove_language_section(troot):
    '''Remove every <div id="mw-mf-language-section"> below *troot*.'''
    # <div class="section" id="mw-mf-language-section">
    for section in troot.xpath('.//div[@id="mw-mf-language-section"]'):
        _remove_keeping_tail(section)


def remove_class_noprint(troot):
    '''Remove every element below *troot* whose class contains "noprint".

    The match is a substring test on the class attribute.
    '''
    # <div class="noprint plainlinks hlist navbar mini" style="">
    for element in troot.xpath('.//*[contains(@class, "noprint")]'):
        _remove_keeping_tail(element)


def remove_ambox(troot):
    '''Remove every element below *troot* whose class contains "ambox".

    The match is a substring test on the class attribute.
    '''
    for element in troot.xpath('.//*[contains(@class, "ambox")]'):
        _remove_keeping_tail(element)
def replace_menu_button(troot):
    '''Turn the mobile main-menu button into a link back to the home page.

    The first <a id="mw-mf-main-menu-button"> below *troot* gets
    href "../", title "Home" and class "homelink" (the previous class value
    is replaced).  Pages without such a button are left unchanged.
    '''
    # <a id="mw-mf-main-menu-button" class="remotewiki external" href="http://en.m.wikipedia.org/wiki/Special:MobileMenu#mw-mf-page-left" title="Open main menu">
    buttons = troot.xpath('.//a[@id="mw-mf-main-menu-button"]')
    if not buttons:
        # No menu button on this page; nothing to rewrite.
        return
    button = buttons[0]
    button.set('href', '../')
    button.set('title', 'Home')
    button.set('class', 'homelink')
def edit_search_box(troot):
    '''Point the mobile search form at the local search page.

    The first <form id="mw-mf-searchForm"> below *troot*, e.g.
        <form id="mw-mf-searchForm" class="search_bar" method="get" action="/w/index.php">
    gets its action set to "../search".  Pages without such a form are
    left unchanged.
    '''
    forms = troot.xpath('.//form[@id="mw-mf-searchForm"]')
    if not forms:
        # No search form on this page; nothing to rewrite.
        return
    forms[0].set('action', '../search')
def parse_article(fn, article_names):
    '''Parse the HTML file *fn* and apply all rewrite steps to its tree.

    fn            -- path of the downloaded article
    article_names -- names of the locally available articles, handed to
                     rewrite_external_urls()

    Returns the root element of the modified tree.
    '''
    parser = etree.HTMLParser()
    # Read inside a with-block so the file handle is closed deterministically;
    # binary mode leaves any decoding to the parser.
    with open(fn, 'rb') as source:
        tree = etree.parse(source, parser)
    troot = tree.getroot()

    replace_image_src(troot)
    edit_search_box(troot)
    remove_language_section(troot)
    remove_class_noprint(troot)
    remove_ambox(troot)
    rewrite_external_urls(troot, article_names)
    # Runs after the link rewrite because it replaces the button's class
    # attribute that rewrite_external_urls() has just extended.
    replace_menu_button(troot)
    rewrite_styles(troot)
    rewrite_scripts(troot)
    return troot
def main(articles_dir, out_dir):
    '''Convert every article in *articles_dir* that is not yet in *out_dir*.

    The names found in *articles_dir* are used both as the work list and as
    the set of locally available articles for link rewriting.  Each
    converted tree is written to *out_dir* prefixed with an HTML5 doctype;
    the path of every processed input file is printed on stdout.

    Returns the root element of the last article processed, or None when
    nothing had to be processed.
    '''
    articles = [x.strip() for x in os.listdir(articles_dir)]
    # Only membership tests are done on the names, once per link, so use a
    # hashed container instead of scanning the list every time.
    known_names = frozenset(articles)
    tree = None  # stays None if every output already exists / dir is empty
    for name in articles:
        safe_name = layout.safe_fn(name.strip())
        fn = os.path.join(articles_dir, safe_name)
        ofn = os.path.join(out_dir, safe_name)
        if os.path.exists(ofn):
            continue
        print(fn)
        tree = parse_article(fn, known_names)
        # etree.tostring() yields bytes; write them in binary mode and let
        # the with-block close the file.
        with open(ofn, 'wb') as out:
            out.write(b'<!DOCTYPE html>\n' + etree.tostring(tree))
    return tree
if __name__ == '__main__':
    # Usage: parsearticles.py ARTICLES_DIR OUT_DIR
    # The last processed tree is kept in a module-level name.
    tree = main(sys.argv[1], sys.argv[2])