-
Notifications
You must be signed in to change notification settings - Fork 2
/
cleaner.py
353 lines (327 loc) · 9.89 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import csv
import subprocess
from pprint import pprint
import Levenshtein
from geopy import geocoders
from config import YAHOOID
# via http://www.iso.org/iso/country_codes
tab = requests.get('http://www.iso.org/iso/home/standards/country_codes/country_names_and_code_elements_txt.htm')
corresp = []
for line in tab.iter_lines():
print(line.split(";"))
try:
name, code = line.split(";")
corresp.append(dict(name=name, code=code))
except:
print("couldn't read{}".format(line.split(";")))
# via https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2, http://www.unece.org/cefact/locode/unlocode_manual.pdf
corresp.append(dict(name="INTERNATIONAL WATERS", code="XZ"))
corresp.append(dict(name="Netherlands Antilles",code="AN"))
with open('./source/corresp.csv','w') as f:
wtr = csv.writer(f, dialect="excel")
for c in corresp:
try:
code = c['code']
name = c['name']
wtr.writerow([code, name])
except:
print('FAILED TO WRITE',r)
subprocess.call("ssconvert ./source/report.xls ./source/report.csv", shell=True)
deathtoll = []
with open('./source/report.csv') as f:
vals = csv.DictReader(f)
for i in vals:
deathtoll.append(i)
for d in deathtoll:
d['Country'] = d["City"].rstrip(",.").split(",")[-1].strip()
gmap = geocoders.GoogleV3()
ymap = geocoders.Yahoo(YAHOOID)
def city2country(x):
try:
gres = gmap.geocode(x)
except:
gres = ''
if gres:
try:
place = gres[0].split(",")[-1]
except:
place = ''
else:
try:
yres = ymap.geocode(x)
place = yres[0].split(",")[-1]
except:
place = ''
country = str(place.strip())
return(country)
countryreps = {
"Paktika" : "Afghanistan",
"Barbuda" : "Antigua and Barbuda",
'Western Australia Australia' : "Australia",
"South Australia" : "Australia",
'South Austalia' : "Australia",
'Western Australia' : "Australia",
'New South Wales': "Australia",
'Tasmania Australia' : "Australia",
"Aghanistan" : "Afghanistan",
'Great Northern Highway Derby Western Australia': "Australia",
'Baden. Austria' : "Austria",
"Neusiedl/zaya" : "Austria",
'The Bahamas': "Bahamas",
"Turks & Caicos":"Turks and Caicos Islands",
"San Salvador" : "Bahamas",
"Eleuthera" : "Bahamas",
"Nassau" : "Bahamas",
"bimini" : "Bahamas",
"abaco" : "Bahamas",
"Abaco" : "Bahamas",
"Freeport" : "Bahamas",
"British Virgin Islands":'Virgin Islands, British',
"British West Indies": "Turks and Caicos Islands",
"Bolivia": "Bolivia, Plurinational State Of",
"Between English Caye And Turneffe Reef" : "International Waters",
"Teakettle Village, Cayo District" : "Belize",
"Cayo District" : "Belize",
"Orange Walk Town": "Belize",
"San Pedro Town": "Belize",
"St. Lucia":"Saint Lucia",
"Antigua": "Antigua and Barbuda",
"Boliviz" : "Bolivia, Plurinational State Of",
"Santa Cruz": "Bolivia, Plurinational State Of",
"Palapye":"Botswana",
"Rn Brazil" : "Brazil",
"Phnom Penh": "Cambodia",
"Sihanoukville": "Cambodia",
"Littoral": "Cameroon",
"British Columbia": "Canada",
"Quebec": "Canada",
"Grand Cayman": "Cayman Islands",
"Sichuan": "China",
"Shanghai":"China",
"Guangdong": "China",
"Heilongjiang":"China",
"Inner Mongolia":"China",
"Sangha":"China",
"Congo-Kinshasa":"Congo, The Democratic Republic Of The",
"Bas-congo":"Congo, The Democratic Republic Of The",
"Aguirre":"Costa Rica",
"San Isidro Del General San Jose":"Costa Rica",
"Ivory Coast": "CÔTE D'IVOIRE",
"Havana":"Cuba",
"Pinar Del Rio" : "Cuba",
"Santo Domingo" : "Dominican Republic",
"La Altagracia" : "Dominican Republic",
"Rio San Juan": "Dominican Republic",
"Puerto Plata": "Dominican Republic",
"Galapagos": "Ecuador",
"Daule" : "Ecuador",
"Azuay": "Ecuador",
"Esmeraldas": "Ecuador",
"Cairo": "Egypt",
"130 Km Far From Addis Ababa" : "Ethiopia",
"sigatoka": "Fiji",
"lautoka": "Fiji",
"Tahiti": 'French Polynesia',
"Nice 06000": "France",
"46250 Gindou":"France",
"Paris 75007":"France",
'Brazil- North 02? 58\'8" /west 030? 35\'4"':"Brazil",
"The Gambia" : "Gambia",
"89584 Ehingen (donau)": "Germany",
"Bonn" : "Germany",
"Frankfurt Am Main": "Germany",
"67655 Kaiserslautern" : "Germany",
"Retalhuleu" : "Guatemala",
"Gressier Haiti": "Haiti",
"Bay Islands Honduras": "Honduras",
"Santa Barbara" : "Honduras",
"Cortes": "Honduras",
"Intibuca": "Honduras",
"Atlantida":"Honduras",
"Francisco Moranzan": "Honduras",
"Reykjavik": "Iceland",
"Ahmedabad 380 007 Gujarat": "India",
"Mumbai": "India",
"Iran":"Iran, Islamic Republic Of",
"Baghdad":"Iraq",
"Between Fallujah And Baghdad":"Iraq",
"Southern Iraq":"Iraq",
"Mayo":"Ireland",
"Kerry":"Ireland",
"Jerusalem":"Israel",
"The West Bank":'Palestine, State Of', #!!!
"Gaza":"Palestine, State Of",
"Northern Israel": "Israel",
"Nothern Israel": "Israel",
"Mougins":"France",
"West Bank":'Palestine, State Of',
"Foggia": "Italy",
"Massino Visconti, Novara, A26 Km 177":"Italy",
"Negril. Jamaica":"Jamaica",
"Osaka Japan":"Japan",
"Atyrau":"Kazakhstan",
"Korea":'Korea, Republic Of',
"South Korea":'Korea, Republic Of',
"North Korea":"Korea, Democratic People'S Republic Of",
"A26 Km 177": "Italy",
"Taebaek":'Korea, Republic Of',
"Seoul":'Korea, Republic Of',
"Shaab Al Bahri":"Kuwait",
"Um Qasr Iraq/kuwait Border": "Kuwait",
"Laos":"Lao People's Democratic Republic",
"Loas":"Lao People's Democratic Republic",
"Sam Neua Laos": "Lao People'S Democratic Republic",
"Macedonia":"Macedonia, The Former Yugoslav Republic Of",
"Kosovo": "Serbia", #!!!
"North Lebanon": "Lebanon",
"The Lebanese-syrian Border": "Lebanon",
"South Lebanon":"Lebanon",
"Nouakchott":"Mauritania",
"Tamaulipas":"Mexico",
"Baja California":"Mexico",
"Guerrero":"Mexico",
"Quintana Roo":"Mexico",
"Chihuahua":"Mexico",
"Sonora":"Mexico",
"BC":"Mexico",
"Michoacan":"Mexico",
"Jalisco":"Mexico",
"Baja California Sur":"Mexico",
"Nuevo Leon":"Mexico",
"San Luis Potosi":"Mexico",
"Zacatecas":"Mexico"}
countryreps.update(
{
"Oax" : "Mexico",
"Nayarit" : "Mexico",
"Coahuila" : "Mexico",
"Oaxaca":"Mexico",
"Mich": "Mexico",
"Gto":"Mexico",
"Colima":"Mexico",
"Mich. Mexico":"Mexico",
"pue. Mexico":"Mexico",
"Ver. Mexico":"Mexico",
"Gro. Mexico":"Mexico",
"Mex":"Mexico",
"Sonora Mexico":"Mexico",
"baja California":"Mexico",
"san Luis Potosi":"Mexico",
"D.f":"Mexico",
"Mina Nuevo Leon":"Mexico",
"Qunitana Roo":"Mexico", # typo
"Sinaloa":"Mexico",
"Sinola": "Mexico",
"Tamps": "Mexico",
"Mulege Baja California":"California",
"Az" : "Mexico",
"Micronesia":'Micronesia, Federated States Of',
"Moldova": "Moldova, Republic Of",
"Nethrland Antilles":"Netherland Antilles",
"Caribbean Netherlands":"Netherland Antilles",
"Caracao Netherlands Antilles":"Netherland Antilles",
"Sint Maarten":"Netherland Antilles",
"Russia":"Russian Federation",
"Cupecoy Beach":"Netherland Antilles",
"Haarlemmermeer":"Netherlands",
"Maroa":"Netherlands",
"Esteli": "Nicaragua",
"Managua":"Nicaragua",
"Nigeria (offshore)":"International Waters",
"Off The Coast Of Nigeria":"International Waters",
"4291 Kopervik":"Norway",
"Muzaffarabad":"Pakistan",
"620000":"Russian Federation",
"Carretera #15 Mexico- Nogales Sonora Km 176+350":"Mexico",
"Federated States of Micronesia":"Micronesia, Federated States of",
"Udhailiyah Saudi Arabia":"Saudi Arabia",
"Dhahran Saudi Arabia":"Saudi Arabia",
"Serbia And Montenegro":"Serbia",
"UK":"United Kingdom",
"Uk":"United Kingdom",
"England":"United Kingdom",
"Scotland":"United Kingdom",
"USVI":"Trinidad and Tobago",
"Thai":"Thailand",
"Tanzania":'Tanzania, United Republic Of',
"Phuket Thailand":"Thailand",
"Tibet Autonomous Region": "China",
"Choyu, Nyalam, Tibet Autonomous Region":"China",
"Taiwan":"Taiwan, Province of China",
"Taiwan 900":"Taiwan, Province of China",
"Taiwan 251":"Taiwan, Province of China",
"Taiwan 242":"Taiwan, Province of China",
'Taiwan 950':"Taiwan, Province of China",
"Venezuela": "Venezuela, Bolivarian Republic Of",
"Syria": "Syrian Arab Republic",
"Chaguaramas":"Trinidad and Tobago",
"Mulege Baja California":"Mexico",
"The Netherlands":"Netherlands",
"Valencia":"Spain",
"Antilles":"Netherlands Antilles",
"Maroa":"New Zealand",
"Banglamung Chon Buri":"Thailand",
"Trinidad":"Trinidad and Tobago",
"Tobago":"Trinidad and Tobago",
"Bunya County Busoga Province":"Uganda",
"Khanh Hoa":"Vietnam"
}
)
def mapreplace(x,disc):
try:
return([v for k,v in disc.items() if x == k][0])
except:
return(None)
for d in deathtoll:
val = mapreplace(d['Country'],countryreps)
if val:
d['Country'] = val
for d in deathtoll:
d['ISO'] = d.get("ISO","")
def matchiso(deathtoll, ratio=0.75):
for d in deathtoll:
maxlev = 0
for c in corresp:
lev = Levenshtein.ratio(d["Country"].lower(),c['name'].lower())
# Consider Levenshtein.ratio("Australia","Austria") == 0.875
if (lev > maxlev) and (lev > ratio):
maxlev = lev
d['ISO'] = c['code']
matchiso(deathtoll,0.9)
for d in deathtoll:
if d['ISO'] == '':
try:
val = city2country(d['City'])
if val:
d['Country'] = val
except:
pass
matchiso(deathtoll,0.5)
for d in deathtoll:
val = mapreplace(d['Country'],countryreps)
if val:
d['Country'] = val
matchiso(deathtoll,0.5)
deathtoll = [w for w in deathtoll if w["City"]!='']
with open('./output/locs.csv','w') as f:
wtr = csv.writer(f, dialect="excel")
wtr.writerow(["Date","City","Cause of Death","ISO","Country"])
for c in deathtoll:
try:
date = c['Date']
city = c['City']
cod = c['Cause of Death']
ISO = c['ISO']
country = c['Country']
wtr.writerow([date, city, cod, ISO, country])
except:
print('FAILED TO WRITE',r)
matchfail = [w for w in deathtoll if w['ISO']=='']
matched = [w for w in deathtoll if w['ISO']!='']
len(matchfail)
pprint(matchfail[0:10])
#t = set(d['Country'] for d in deathtoll)
#t = set(d['City'] for d in deathtoll)