-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathItemRelationsCsvReader.py
74 lines (61 loc) · 2.12 KB
/
ItemRelationsCsvReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
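read_csv returns a generator that yields the tuple (title, [(p1, v1), (p2, v2), ...])
where
p_n is a property
v_n is a value
Only claims whose datatype is "wikibase-entityid" are included; the datatype
itself is used for filtering and is not part of the yielded tuples.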
usage:
    with open("file.csv", "r") as f:
        for title, claims in read_csv(f):
            do_things()
"""
import argparse, time
from CompressedFileType import CompressedFileType
def read_csv(input_file, seperator=","):
    """Yield ``(title, claims)`` for each run of lines sharing a title.

    Every non-blank line must have the form
    ``title<sep>prop<sep>datatype<sep>value``.  The line is split at most
    three times, so ``value`` may itself contain the separator.

    Lines are grouped by *consecutive* equal titles: a new group starts
    whenever the title differs from the previous line's title, so the input
    has to be sorted/grouped by title for each title to be yielded once.

    Args:
        input_file: Iterable of text lines (e.g. an open file).
        seperator: Field separator used to split each line.

    Yields:
        Tuples ``(title, claims)`` where ``claims`` is a list of
        ``(prop, value)`` pairs, restricted to the claims whose datatype is
        ``"wikibase-entityid"``.  The list is empty when a title has no such
        claim.

    Raises:
        ValueError: If a non-blank line has fewer than four fields.
    """
    current_title = None
    claims = []
    for line in input_file:
        record = line.strip()
        if not record:
            # Blank lines (typically a trailing newline at the end of the
            # dump) carry no data; skip them instead of failing the unpack.
            continue
        title, prop, datatype, value = record.split(seperator, 3)
        if current_title != title:
            # Title changed: flush the finished group before starting anew.
            if current_title is not None:
                yield current_title, claims
            current_title = title
            claims = []
        if datatype == "wikibase-entityid":
            claims.append((prop, value))

    # Flush the last group; nothing is yielded for an empty input.
    if current_title is not None:
        yield current_title, claims
def read_compressed_csv(input_file, seperator=","):
    """Yield ``(title, claims)`` for each entity of a compressed CSV dump.

    In the compressed layout an entity is introduced by a header line that
    starts with ``=`` followed by the title; the lines after it, up to the
    next header, have the form ``prop<sep>datatype<sep>value``.  Claim lines
    are split at most twice, so ``value`` may itself contain the separator.

    Args:
        input_file: Iterable of text lines (e.g. an open file).
        seperator: Field separator used to split each claim line.

    Yields:
        Tuples ``(title, claims)`` where ``claims`` is a list of
        ``(prop, value)`` pairs, restricted to the claims whose datatype is
        ``"wikibase-entityid"``.  Each entity gets its own list.  Claim lines
        that appear before the first header belong to no entity and are
        dropped.

    Raises:
        ValueError: If a non-blank claim line has fewer than three fields.
    """
    title = None
    claims = []
    for line in input_file:
        if line.startswith("="):
            # Header line: flush the previous entity, then start a new one.
            if title is not None:
                yield title, claims
            title = line[1:].strip()
            # A fresh list per entity; without this reset every entity would
            # be yielded with the claims of all entities before it.
            claims = []
            continue

        record = line.strip()
        if not record:
            # Blank lines carry no claim; skip them instead of failing the
            # unpack below.
            continue
        prop, datatype, value = record.split(seperator, 2)
        if datatype == "wikibase-entityid":
            claims.append((prop, value))

    # Flush the last entity; nothing is yielded when no header was seen.
    if title is not None:
        yield title, claims
if __name__ == "__main__":
    # Command-line entry point: stream a dump through the selected reader,
    # optionally printing every yielded (title, claims) tuple, and report the
    # elapsed wall-clock time at the end.
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="The CSV input file (a wikidata dump), gzip is supported",
                        type=CompressedFileType('r'))
    parser.add_argument("-c", "--compressed", help="Use compressed csv (every entity is shown only once)",
                        action="store_true")
    # The flag suppresses the per-entity output; the help text says so.
    parser.add_argument("-s", "--silent", help="Do not print the entities, only the total time",
                        action="store_true")
    args = parser.parse_args()

    read_method = read_compressed_csv if args.compressed else read_csv
    silent = args.silent

    start = time.time()
    for element in read_method(args.input):
        if not silent:
            # Single-argument print(...) prints the same on Python 2 and 3.
            print(element)
    print("total time: %.2fs" % (time.time() - start))