#!/usr/bin/env python3
"""Translate Wikipedia edit history XML files to JSON.
This script assumes that the Wikipedia edit history XML files are ordered as
follows:
<mediawiki>
...
<page>
<title></title>
<ns></ns>
<id></id>
<redirect title="" /> <!-- Optional -->
<revision>
<id></id>
<timestamp></timestamp>
<contributor>
<username></username>
<id></id>
</contributor>
<comment></comment>
<model></model>
<format></format>
<text xml:space="preserve"></text>
<sha1></sha1>
</revision>
...
</page>
</mediawiki>
Specifically, it assumes that the title, ns, id, and redirect tags for a page
appear before its revisions.
This is the format the files are in when they are downloaded from:
https://dumps.wikimedia.org/enwiki/latest/
Example:
This script is designed to sit in the middle of a series of piped
commands. Most often it reads XML on stdin from a 7-Zip extractor and
writes JSON on stdout, typically into a file or a compression program,
as follows:
$ 7z e -so input_file.7z | xml_to_json.py | gzip > output_file.json.gz
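Each output line is a single JSON object, so downstream code can
stream-parse the result one revision at a time. A minimal reader sketch
(the file name matches the example above):

    import gzip
    import json

    with gzip.open("output_file.json.gz", "rt") as f:
        for line in f:
            revision = json.loads(line)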
Attributes:
NAMESPACE (str): The default namespace of all tags in the Wikipedia edit
history XML file.
REVISION (dict): A dictionary that stores the information from each
revision as well as some information about the article in general. The
variables are as follows:
- article_title (str): The title of the article.
- article_id (int): A unique identifier for each article.
- article_namespace (int): The namespace of the article, as defined
by Wikipedia.
- redirect_target (str): If the article is a redirect, the target
article's article_title, otherwise None.
- revision_id (int): Unique identifier of the revision.
- parent_id (int): Unique identifier of the parent of the revision.
- timestamp (str): Time the revision was made.
- user_name (str): The name of the user, or their IP address if not
logged in.
- user_id (int): A unique id of the user, or None if they were not
logged in.
- comment (str): The comment the user left about their edit.
        - minor (bool): True if the edit was marked as "minor", otherwise
          False.
        - full_text (str): The full text of the revision if the --full-text
          flag is given, otherwise None.
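    A single output line might look like this (all field values are
    illustrative, and the object is wrapped here for readability; the
    script emits each object on one line):

        {"article_title": "Example", "article_id": 12, "article_namespace": 0,
         "redirect_target": null, "revision_id": 236939, "parent_id": null,
         "timestamp": "2001-01-15T19:27:13Z", "user_name": "ExampleUser",
         "user_id": 99, "comment": "fix typo", "minor": false,
         "full_text": null}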
"""
from copy import deepcopy
from datetime import datetime
import argparse
import json
import sys
import xml.etree.ElementTree as ET
NAMESPACE = "{http://www.mediawiki.org/xml/export-0.10/}"
# JSON revision object
REVISION = {
    # General article information
    "article_title": None,      # string
    "article_id": None,         # int
    "article_namespace": None,  # int
    "redirect_target": None,    # string, or None if not a redirect
    # Revision-specific information
    "revision_id": None,        # int
    "parent_id": None,          # int
    "timestamp": None,          # ISO 8601 date and time string
    "user_name": None,          # username, or IP address as a string
    "user_id": None,            # int if logged in, otherwise None
    "comment": None,            # string
    "minor": False,             # bool
    "full_text": None,          # string, or None unless --full-text is set
}
def fill_rev(revision, element, in_revision_tree, namespace='', save_full_text=False):
"""Fill the fields of a revision dictionary given an element from an XML
Element Tree.
Args:
- revision (dict): A revision dictionary with fields already in place.
- element (ElementTree Element): An element from ElementTree.
- in_revision_tree (bool): True if inside a <revision> element,
otherwise should be set to False.
        - namespace (Optional[str]): The XML namespace that the tags exist in.
        - save_full_text (Optional[bool]): Save the full text of the article
          if True, otherwise set it to None.
Returns:
None
"""
if element.tag == namespace + "id":
if in_revision_tree:
revision["revision_id"] = int(element.text)
else:
revision["article_id"] = int(element.text)
elif element.tag == namespace + "parentid":
revision["parent_id"] = int(element.text)
elif element.tag == namespace + "timestamp":
revision["timestamp"] = element.text
elif element.tag == namespace + "minor":
revision["minor"] = True
elif element.tag == namespace + "comment":
revision["comment"] = element.text
elif element.tag == namespace + "contributor":
for child in element:
if child.tag == namespace + "username" or child.tag == namespace + "ip":
revision["user_name"] = child.text
elif child.tag == namespace + "id":
revision["user_id"] = int(child.text)
elif element.tag == namespace + "title":
revision["article_title"] = element.text
elif element.tag == namespace + "ns":
revision["article_namespace"] = int(element.text)
elif element.tag == namespace + "redirect":
revision["redirect_target"] = element.get("title")
elif element.tag == namespace + "text":
if save_full_text:
revision["full_text"] = element.text
# Set up command line flag handling
parser = argparse.ArgumentParser(
description="Transform Wikipedia XML to JSON.",
usage="7z e -so input.7z | %(prog)s [options] > output.json",
)
parser.add_argument(
'-f',
'--full-text',
help="keep the full text of each article, otherwise set it to None",
action="store_true",
dest="save_full_text",
default=False,
)
parser.add_argument(
'-l',
'--latest-revision-only',
help="keep only the latest revision of an article",
action="store_true",
dest="save_newest_only",
default=False,
)
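# Example invocations (illustrative): the first streams every revision as
# JSON, the second keeps only each article's latest revision with full text.
#
#   $ 7z e -so input_file.7z | ./xml_to_json.py > output.json
#   $ 7z e -so input_file.7z | ./xml_to_json.py -l -f | gzip > latest.json.gz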
# Run only if this script is being called directly
if __name__ == "__main__":
args = parser.parse_args()
in_page = False
in_revision = False
    for event, elem in ET.iterparse(sys.stdin.buffer, events=("start", "end")):
# When a page element is started we set up a new revisions dictionary
# with the article title, id, namespace, and redirection information.
# This dictionary is then deepcopied for each revision. It is deleted
# when a page ends.
if event == "start" and elem.tag == NAMESPACE + "page":
in_page = True
page_rev = deepcopy(REVISION)
if args.save_newest_only:
newest = None
newest_date = None
elif event == "end" and elem.tag == NAMESPACE + "page":
if args.save_newest_only:
                print(json.dumps(newest))
del newest
del newest_date
in_page = False
del cur_rev
del page_rev
# When a revision starts we copy the current page dictionary and fill
# it. Revisions are sorted last in the XML tree, so the page_rev
# dictionary will be filled out by the time we reach them.
if event == "start" and elem.tag == NAMESPACE + "revision":
in_revision = True
cur_rev = deepcopy(page_rev)
elif event == "end" and elem.tag == NAMESPACE + "revision":
for child in elem:
fill_rev(cur_rev, child, in_revision, NAMESPACE, args.save_full_text)
child.clear()
in_revision = False
if not args.save_newest_only:
                print(json.dumps(cur_rev))
            else:
                test_date = datetime.strptime(
                    cur_rev["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
                if newest is None or test_date > newest_date:
                    newest = cur_rev
                    newest_date = test_date
elem.clear()
        # Otherwise, if we are not in a revision but are in a page, the
        # elements describe the article itself, and we save them into the
        # page_rev dictionary.
if event == "end" and in_page and not in_revision:
fill_rev(page_rev, elem, in_revision, NAMESPACE, args.save_full_text)
elem.clear()