-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-data-wiki.py
197 lines (172 loc) · 7.33 KB
/
get-data-wiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import sys
import os
import wikipedia
import requests
import subprocess
import re
import rapidjson
from unicodedata import category as cat # for punctuation removal
import random
import shutil
from itertools import zip_longest
import paramiko
import getpass
def check_pwd(address, port, usr, pwd):
    """Return True if the SSH credentials authenticate against address:port.

    Opens a throwaway paramiko SSH connection and immediately closes it.
    Any connection or authentication failure yields False.

    Args:
        address: SSH/SFTP host to connect to.
        port: TCP port of the SSH server.
        usr: username to authenticate as.
        pwd: password to try.
    """
    try:
        client = paramiko.client.SSHClient()
        client.load_system_host_keys()  # this loads any local ssh keys
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(address, port=port, username=usr, password=pwd)
        client.close()
        return True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any paramiko/socket error means "not authenticated".
        return False
def sftp(address, port, usr, pwd, remworkdir, fname):
    """Upload the local file '.'+fname to remworkdir+fname over SFTP.

    fname is expected to start with '/<subdir>/...' so that '.'+fname
    resolves to the local copy and remworkdir+fname to the remote target
    (callers pass e.g. '/extracted/<lang>.txt').

    Args:
        address: SFTP host.
        port: SFTP port.
        usr: username.
        pwd: password (already validated via check_pwd).
        remworkdir: remote working directory prefix.
        fname: path suffix shared by the local and remote file.

    Raises:
        paramiko.SSHException / IOError on connection or transfer failure
        (propagated to the caller, as in the original behavior).
    """
    client = paramiko.client.SSHClient()
    client.load_system_host_keys()  # this loads any local ssh keys
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(address, port=port, username=usr, password=pwd)
    try:
        sftp_client = client.open_sftp()  # type SFTPClient
        try:
            sftp_client.put('.'+fname, remworkdir+fname)  # src, dest path/filename
            print('\ttransferred', fname)
        finally:
            sftp_client.close()
    finally:
        # Close the SSH session even when the transfer raises; the original
        # leaked the connection on any error. Renamed the local from `sftp`,
        # which shadowed this function's own name.
        client.close()
if __name__ == '__main__':
    # Driver: for each Wikipedia language edition, download the latest
    # article dump, strip XML/punctuation/numbers, split the cleaned text
    # into shuffled fixed-size chunks, push each artifact to a remote host
    # over SFTP, and delete the local copies afterwards.
    #TODO: commandlineargs address, port, usr, statmsgs
    address = ''
    port = ''
    username = ''
    remworkdir = ''
    statmsgs = True  # whether or not to output mid-language status updates
    print(sys.argv)
    if(len(sys.argv) != 6):
        print('USAGE: python3 get-data-wiki.py <sftp address> <sftp port> <sftp username> <remote workdir> <verbose (0/1)>')
        exit()
    else:
        address = sys.argv[1]
        port = int(sys.argv[2])
        username = sys.argv[3]
        remworkdir = sys.argv[4]
        statmsgs = bool(int(sys.argv[5]))
    outpath = './'
    langs = wikipedia.languages()  # dictionary where key is code, value is language name in that language
    # authenticate to storage device: prompt until check_pwd() succeeds;
    # the accepted password is reused for every later sftp() call.
    authenticated = False
    pwd = ''
    while not authenticated:
        pwd = getpass.getpass(prompt='sftp password: ')
        authenticated = check_pwd(address, port, username, pwd)
        if not authenticated:
            print('authentication failed. try again.')
        else:
            print('authenticated.')
    # create dumps, extracted dirs (plus texts/chunked working dirs)
    if 'dumps' not in os.listdir():
        os.mkdir('dumps')
    if 'extracted' not in os.listdir():
        os.mkdir('extracted')
    if 'texts' not in os.listdir():
        os.mkdir('texts')
    if 'chunked' not in os.listdir():
        os.mkdir('chunked')
    # `skip` stays True until the 'en' entry is seen, so every language
    # before 'en' in langs' iteration order is skipped; 'en' itself is also
    # skipped by the `continue` below. Resume/debug aid.
    skip = True
    print('status\t\tcode\tlanguage name')
    print('------------------------------------')
    for lang in langs.keys():
        if lang =='en': ## useful for debugging
            skip = False
            continue
        elif skip:
            continue
        # else:
        #     break
        try:
            # don't download dump if already have it from previous run:
            if lang + '-raw.xml.bz2' not in os.listdir('./dumps/') and lang + '.txt' not in os.listdir('./extracted/'):
                # download the dump, save raw to file
                # NOTE(review): r.content buffers the whole dump in memory;
                # very large wikis may need stream=True — confirm.
                dumpname = lang + 'wiki-latest-pages-articles.xml.bz2'
                dumplink = 'https://dumps.wikimedia.org/' + lang + 'wiki/latest/' + dumpname
                r = requests.get(dumplink, allow_redirects=True)
                fsz = int(r.headers['Content-length'])
                # print(fsz)
                if fsz < 2**20: #filesize less than 1 MB, can't do much with it so skip
                    print('wiki too small', lang, langs[lang], sep='\t')
                    continue
                open('./dumps/'+lang+'-raw.xml.bz2', 'wb').write(r.content)
        except: #closed wikipedias throw an error when you try to download
            print('empty wiki', lang, langs[lang], sep='\t')
            continue
        print('proceeding...', lang, langs[lang], sep='\t')  # means this language
        #will be included in the corpus
        #remove XML: get-data-wiki.sh presumably decompresses the dump and
        #writes plain text to ./extracted/<lang>.txt — verify against the script
        if lang + '.txt' not in os.listdir('./extracted/'):
            if statmsgs: print('\tremove XML')
            subprocess.call(['sh', './get-data-wiki.sh', './dumps/'+lang+'-raw.xml.bz2', lang])
        elif statmsgs: print('reading from backup extracted')
        fname = './extracted/' + lang + '.txt'  #temp file created by the shell script
        # send to device
        sftp(address, port, username, pwd, remworkdir,
             '/extracted/' + lang + '.txt')  #sftp NEEDS *this* filename format, do not touch!
        f = open(fname, 'r')
        l = f.readlines()
        f.close()
        # clean 1: replace newlines, digits, URLs, leftover XML entities/tags
        # and __WIKI_DIRECTIVES__ with spaces, line by line
        if statmsgs: print('\tclean 1')
        pat = re.compile('\\n|\d|https?://.*|<.*;|__.*__')
        textsarr = []
        for line in l:
            textsarr.append(re.sub(pat, ' ', line))
        texts = ' '.join(textsarr)
        # final clean: remove punctuation, sequences of multiple spaces
        if statmsgs: print('\tclean 2')
        texts2 = ''
        for char in texts:
            # punctuation pipeline newline numbers, ², fractions, etc
            # (Unicode category P* = punctuation, N* = numeric; cat() is
            # unicodedata.category)
            if (not cat(char).startswith('P')) and (char != '|') and (char !='\n') and (not cat(char).startswith('N')):
                texts2 += char
        texts = texts2
        pattern = re.compile(r' +')
        texts = re.sub(pattern, ' ', texts)  #replace multiple spaces with just one
        # write non-chunked text to file, split into parts under the size cap
        slen = len(texts)  # num chars
        sz_limit = 2**27  # 2 gigabytes =2^30. Divide by 8=2^3 bits per char, 2^27
        num_parts = (slen // sz_limit) + 1
        fsz = int(slen/num_parts)  # chars per part (integer division; last slice may differ)
        for i in range(num_parts):
            f = open('./texts/' + lang + str(i) + '.txt', 'w')
            f.writelines(texts[fsz*i: fsz*(i+1)] + '\n')
            f.close()
            # send to device
            sftp(address, port, username, pwd, remworkdir, '/texts/' + lang + str(i) + '.txt')
        # split to 500-char chunks
        if statmsgs: print('\tchunking')
        n = 500  # size of a chunk/line
        # Group function using zip_longest to split the text into n-char
        # groups; the '' fillvalue pads the final short group.
        def group(n, iterable, fillvalue=None):
            args = [iter(iterable)] * n
            return zip_longest(fillvalue=fillvalue, *args)
        chunks = [''.join(lis) for lis in group(n, texts, '')]
        # shuffle chunks, then limit to 10000 chunks per language
        if statmsgs: print('\tshuffling')
        random.seed(5)  # fixed seed so sampling/shuffling is reproducible
        chunklimit = 10000
        if len(chunks) > chunklimit:
            chunks = random.sample(chunks, chunklimit)  # choose chunklimit chunks at random
        random.shuffle(chunks)
        # write to file, one chunk per line
        if statmsgs: print('\twriting to file')
        f = open('./chunked/' + lang + '.txt', 'w')
        f.writelines([c+'\n' for c in chunks])
        f.close()
        # send to device then delete locally
        sftp(address, port, username, pwd, remworkdir, '/chunked/' + lang + '.txt')
        # clean: remove every local artifact for this language
        os.remove('./chunked/' + lang + '.txt')
        for i in range(num_parts):
            os.remove('./texts/' + lang + str(i) + '.txt')
        os.remove('./extracted/' + lang + '.txt')
        os.remove('./dumps/'+lang+'-raw.xml.bz2')  #delete the raw dump too