This repository has been archived by the owner on Oct 21, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
wikidata.py
135 lines (104 loc) · 3.69 KB
/
wikidata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import gzip
import os
import re
import subprocess
import sys
# Download file and save in cache directory, if it doesn't already exist
def download(url):
    """Fetch ``url`` with wget into the ``cache`` directory and return its path.

    The local file name is the last ``/``-separated segment of ``url``. If a
    file with that name is already present in ``cache`` nothing is downloaded.

    Returns:
        The relative path ``'cache/<file name>'``.

    Raises:
        OSError: if the cache directory cannot be created or wget cannot be
            started.
        RuntimeError: if wget exits with a non-zero status.
    """
    cache_dir = 'cache'
    outputfilename = cache_dir + '/' + url.split('/')[-1]

    # Create the cache directory; an already existing directory is fine,
    # any other failure (e.g. missing permissions) must not be hidden.
    try:
        os.mkdir(cache_dir)
    except OSError:
        if not os.path.isdir(cache_dir):
            raise

    # Check if file already exists
    if os.path.isfile(outputfilename):
        print("%s already exists" % outputfilename)
        return outputfilename

    # Download file (wget runs inside the cache directory so that it stores
    # the file under its own name there)
    print("Downloading %s" % outputfilename)
    wget = subprocess.Popen(
        ['wget', url], cwd=cache_dir, stdout=subprocess.PIPE
    )
    wget.communicate()
    if wget.returncode != 0:
        # A partial file would be mistaken for a finished download by the
        # "already exists" check on the next run, so drop it.
        if os.path.isfile(outputfilename):
            os.remove(outputfilename)
        raise RuntimeError(
            "wget failed with exit code %d while downloading %s"
            % (wget.returncode, url)
        )
    print("\nDone")
    return outputfilename
# Throw away all triples with a language-tagged object
# where the language is not English.
def english_please(inputfilename):
    """Copy a gzipped triple file, dropping non-English language-tagged lines.

    A line is kept when it does not end in ``@<tag> .`` or when that tag is
    exactly ``en``. The result is written to the ``output`` directory; its
    name is the input's base name with ``english`` inserted after the first
    ``-``-separated part (``a-b.nt.gz`` becomes ``output/a-english-b.nt.gz``).
    If that file already exists, nothing is processed.

    Returns:
        The relative path of the output file.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # Make sure the output folder exists; only "already there" is ignored
    try:
        os.mkdir('output')
    except OSError:
        if not os.path.isdir('output'):
            raise

    # Check if file already exists
    name_parts = inputfilename.split('/')[-1].split('-')
    outputfilename = 'output/' + '-'.join(
        name_parts[:1] + ['english'] + name_parts[1:]
    )
    if os.path.isfile(outputfilename):
        print("%s already exists" % outputfilename)
        return outputfilename

    print('Throwing away all triples containing non-English objects')
    # Language tag of the triple's object, if one is set
    language_tag = re.compile(r'\@([^ ]+) \.$')
    # Open UTF-8 encoded, gzipped input file
    with codecs.getreader('utf-8')(gzip.open(inputfilename, 'rb')) \
            as inputfile:
        # Open UTF-8 encoded, gzipped output file. This has to be a stream
        # *writer*: a reader would hand the text to the binary gzip stream
        # without encoding it.
        with codecs.getwriter('utf-8')(gzip.open(outputfilename, 'wb')) \
                as outputfile:
            counter = 0
            for line in inputfile:
                language = language_tag.search(line)
                # If there is no language set or the language is English ...
                if not language or language.group(1) == 'en':
                    # ... write the line to the output file
                    outputfile.write(line)
                # Progress indicator: one dot per 100000 input lines
                counter += 1
                if counter % 100000 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
    print('\nDone')
    return outputfilename
# Make wikidata-properties.nt.gz compatible with
# wikidata-simple-statements.nt.gz
def simple_properties(inputfilename):
    """Rewrite property URIs ``/entity/P<digits>`` to ``/entity/P<digits>c``.

    Every occurrence in every line of the gzipped, UTF-8 encoded input file
    is rewritten. The output name is the full input path with ``simple``
    inserted after its second ``-``-separated part, so the file is written
    next to the input (``output/a-b-c.nt.gz`` becomes
    ``output/a-b-simple-c.nt.gz``). If that file already exists, nothing is
    processed.

    Returns:
        The path of the output file.

    Raises:
        OSError: if the output directory cannot be created.
    """
    # Make sure the output folder exists; only "already there" is ignored
    try:
        os.mkdir('output')
    except OSError:
        if not os.path.isdir('output'):
            raise

    # Check if file already exists
    path_parts = inputfilename.split('-')
    outputfilename = '-'.join(
        path_parts[0:2] + ['simple'] + path_parts[2:]
    )
    if os.path.isfile(outputfilename):
        print("%s already exists" % outputfilename)
        return outputfilename

    print('Making properties compatible with simple statements')
    property_uri = re.compile(r'/entity/P(\d+)')
    # Open UTF-8 encoded, gzipped input file
    with codecs.getreader('utf-8')(gzip.open(inputfilename, 'rb')) \
            as inputfile:
        # Open UTF-8 encoded, gzipped output file. This has to be a stream
        # *writer*: a reader would hand the text to the binary gzip stream
        # without encoding it.
        with codecs.getwriter('utf-8')(gzip.open(outputfilename, 'wb')) \
                as outputfile:
            counter = 0
            for line in inputfile:
                # Repair the property URIs and write the line to the output
                outputfile.write(property_uri.sub(r'/entity/P\1c', line))
                # Progress indicator: one dot per 1000 lines
                counter += 1
                if counter % 1000 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
    print('\nDone')
    # Return the path on this branch too, like the "already exists" branch
    return outputfilename