export.py
#!/usr/bin/env python3
'''
Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt>

This file is part of cppreference-doc

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
'''

import argparse
import json
import urllib.parse
import urllib.request


def retrieve_page_names(root, ns_index):
    ''' Returns the titles of all pages in the given namespace, following
        MediaWiki API continuation until the full list has been retrieved.
    '''
    begin = None
    pages = []
    while True:
        params = {
            'action': 'query',
            'list': 'allpages',
            'apnamespace': ns_index,
            'aplimit': 500,
            'format': 'json'
        }
        if begin is not None:
            params['apcontinue'] = begin
        url = "{0}/api.php?{1}".format(root, urllib.parse.urlencode(params))

        with urllib.request.urlopen(url) as f:
            data = json.loads(f.read().decode('utf-8'))
        pages += [p['title'] for p in data['query']['allpages']]

        # Keep paginating while the API reports a continuation token;
        # otherwise the namespace has been listed completely.
        if ('query-continue' in data and
                'allpages' in data['query-continue'] and
                'apcontinue' in data['query-continue']['allpages']):
            begin = data['query-continue']['allpages']['apcontinue']
        else:
            return pages


def export_pages(root, pages, output_path):
    ''' Submits the page list to Special:Export and saves the resulting XML
        dump to output_path. Only the current revision of each page is
        exported (curonly=1).
    '''
    params = {
        'wpDownload': '',
        'curonly': 1,
        'pages': '\n'.join(pages)
    }
    data = urllib.parse.urlencode(params)
    data = data.encode('ascii')

    url = "{0}/index.php?title=Special:Export&action=submit".format(root)
    urllib.request.urlretrieve(url, output_path, data=data)


def main():
    parser = argparse.ArgumentParser(prog='export.py')
    parser.add_argument('--url', type=str,
                        help='The URL to the root of the MediaWiki '
                             'installation')
    parser.add_argument('output_path', type=str,
                        help='The path to the XML file to save output to')
    parser.add_argument('ns_index', type=str, nargs='+',
                        help='The indices of the namespaces to retrieve')
    args = parser.parse_args()

    pages = []
    for ns_index in args.ns_index:
        new_pages = retrieve_page_names(args.url, ns_index)
        print("Retrieved {0} pages for namespace {1}".format(len(new_pages),
                                                             ns_index))
        pages += new_pages

    pages = sorted(pages)
    export_pages(args.url, pages, args.output_path)


if __name__ == "__main__":
    main()
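
# Usage sketch (hypothetical values): the wiki root URL, output file name, and
# namespace indices below are illustrative only and are not taken from this
# script; substitute the MediaWiki installation and namespaces you actually
# want to export.
#
#   ./export.py --url https://example.org/w output.xml 0 10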