#!/usr/bin/python3
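"""Fetch Wikimania 2019 programme pages and their plain-text extracts.

Scans the newest-looking wikitext dump in the current directory (the
lexicographically greatest file whose name starts with a digit) for
{{Program item|title=...}} templates; one spotlight-session title is
seeded by hand. Each collected page's wikitext revision and plain-text
extract are then fetched from the Wikimania MediaWiki API and written
as one JSON file per page into the pages/ and extracts/ directories
(both must already exist).
"""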
import urllib.parse
import requests
import json
import os
from itertools import islice
from pprint import pprint

user_agent = 'Program converter/0.1'
query_url = 'https://wikimania.wikimedia.org/w/api.php'

# A programme entry looks like {{Program item|title=...|...}} in the dump.
start = '{{Program item|'
start2 = start + 'title='

# Number of titles sent per API request.
page_size = 50


def tidy_title(title):
    """Turn a URL-style title into display form: '_' -> ' ', percent-decoded."""
    return urllib.parse.unquote(title.replace('_', ' '))


def chunk(it, size):
    """Return an iterator of tuples of up to `size` items from `it`."""
    it = iter(it)
    # Repeatedly take the next `size` items; the empty tuple is the
    # sentinel that stops iter() once `it` is exhausted.
    return iter(lambda: tuple(islice(it, size)), ())
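
# A quick sanity check of chunk's behaviour (illustrative):
#     list(chunk(range(5), 2)) == [(0, 1), (2, 3), (4,)]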


def run_query(titles, params):
    """Run one action=query API request and return the 'pages' list."""
    base = {
        'format': 'json',
        'formatversion': 2,
        'action': 'query',
        'continue': '',
        'titles': '|'.join(titles),
    }
    p = base.copy()
    p.update(params)
    r = requests.get(query_url, params=p, headers={'User-Agent': user_agent})
    json_reply = r.json()
    if 'query' not in json_reply:
        # Most likely an API error; dump the reply before the KeyError below.
        pprint(json_reply)
    return json_reply['query']['pages']
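
# Note: run_query does not follow API continuation (a 'continue' member in
# the reply), so a batch whose results exceed one reply could come back
# incomplete; page_size = 50 matches the default per-request title limit.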


def extracts_query(titles):
    """Request plain-text extracts for a batch of titles."""
    params = {
        'prop': 'extracts',
        'exlimit': 'max',
        # 'exintro': '1',
        'explaintext': '1',
    }
    return run_query(titles, params)


def get_extracts(titles):
    for cur in chunk(titles, page_size):
        for page in extracts_query(cur):
            yield page


def get_page_iter(titles):
    params = {'prop': 'revisions', 'rvprop': 'ids|content'}
    for cur in chunk(titles, page_size):
        for page in run_query(cur, params):
            yield page
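
# Pick the newest-looking dump in the current directory (greatest filename
# that starts with a digit, e.g. a dated download) and harvest programme
# titles from it.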
filename = max(f for f in os.listdir('.') if f[0].isdigit())
titles = ['2019:Program/Free Knowledge and the Global Goals Spotlight Session']
with open(filename) as f:
    for line in f:
        if start not in line or '}}' not in line:
            continue
        item = line[line.find(start):line.rfind('}}') + 2]
        if not item.startswith(start2):
            # Debug aid: show the offending line before the assertion fails.
            print(line)
        assert item.startswith(start2)
        # The title runs from after 'title=' to the next '|', or to the
        # closing braces when the template has no further parameters.
        end = item.find('|', len(start2))
        if end == -1:
            end = item.rfind('}}')
        title = item[len(start2):end].strip()
        if not title or title == 'Test':
            continue
        titles.append(tidy_title(title))


def get_pages():
    """Save each page's revision JSON to pages/<pageid>.json."""
    for page in get_page_iter(titles):
        if 'missing' in page:
            continue
        if 'pageid' not in page:
            # Unexpected reply shape; show it before the KeyError below.
            print(json.dumps(page, indent=2))
        pageid = page['pageid']
        with open(f'pages/{pageid:05d}.json', 'w') as out:
            json.dump(page, out)


def extracts():
    """Save each page's plain-text extract to extracts/<pageid>.json."""
    for page in get_extracts(titles):
        print(page)
        if 'missing' in page:
            continue
        pageid = page['pageid']
        with open(f'extracts/{pageid:05d}.json', 'w') as out:
            json.dump(page, out)


get_pages()
extracts()