# Source: "Scrape Enaho.py" (243 lines, 9.81 KB), from a fork of PeruData/ENAHO.
###########################################################################################################
# Peruvian Households Dataset
# Author: Sebastian Sardon
# Last updated: June 16, 2019
# Retrieves raw ENAHO data from INEI's official website
# Reference period: 1997-2018 (these are all the years for which complete surveys are available)
###########################################################################################################
#import dbf
import glob
import numpy as np
import pandas as pd
import pathlib
import os
import re
import shutil
import time
import zipfile
from simpledbf import Dbf5
from urllib.request import urlretrieve
# Codes for surveys of the class "ENAHO Metodología ACTUALIZADA".
# These are rather unstructured, so the codes were obtained manually from
# INEI's webpage (one code per survey year).
survey_codes = {
    "enaho_1997": "04", "enaho_1998": "08",
    "enaho_1999": "13", "enaho_2000": "30",
    "enaho_2001": "52", "enaho_2002": "91",
    "enaho_2003": "31", "enaho_2004": "280",
    "enaho_2005": "281", "enaho_2006": "282",
    "enaho_2007": "283", "enaho_2008": "284",
    "enaho_2009": "285", "enaho_2010": "279",
    "enaho_2011": "291", "enaho_2012": "324",
    "enaho_2013": "404", "enaho_2014": "440",
    "enaho_2015": "498", "enaho_2016": "546",
    "enaho_2017": "603", "enaho_2018": "634",
}
# Survey modules to download; the strings are inserted verbatim into the URLs.
mod_codes = ["01", "02", "03", "05", "34", "85"]
# Working directory: every relative path used below is resolved against it.
root = "/Users/Sebastian/Downloads/temp"
os.chdir(root)
# "Trash" receives the downloaded zip files.
try:
    os.mkdir("Trash")
except FileExistsError:
    # Only a pre-existing folder is expected; any other failure (permissions,
    # missing parent, ...) must surface instead of being misreported.
    print("Folder already exists")
#1. Scrape zip files
# Downloads one zip per (year, module) pair into Trash/. Years before 2004 are
# requested from the "DBF" path, later years from the "STATA" path. Failed
# downloads are collected in `errors` instead of aborting the run.
start_time = time.time()
errors = []
for yy in range(1997, 2019):
    # Neither the file format nor the survey code depends on the module.
    kind = "DBF" if yy < 2004 else "STATA"
    survey_code = survey_codes["enaho_{0}".format(yy)]
    for mod_code in mod_codes:
        print("retrieving data for year {0} - module {1}".format(yy, mod_code))
        url = "http://iinei.inei.gob.pe/iinei/srienaho/descarga/{0}/{1}-Modulo{2}.zip".format(
            kind, survey_code, mod_code
        )
        try:
            urlretrieve(url, "Trash/module {0} {1}.zip".format(mod_code, yy))
        except Exception:
            # Best effort: keep going on any download failure, but (unlike a
            # bare `except:`) let Ctrl-C / SystemExit stop the long-running loop.
            if yy < 2003 and mod_code == "85":
                # Expected gap, so it is not counted as an error.
                print("module 85 not available for year {0}".format(yy))
            else:
                print("ERROR")
                errors.append(url)
print("Scraping complete. {0} errors:".format(len(errors)))
print(errors)
ellapsed = time.time() - start_time
print("This takes {0}s".format(ellapsed))
#Around 568s (=9 min)
#2. Extract zip files
# Rebuilds "Enaho/in/Raw Data/module <mod>/<year>/" from scratch and unpacks
# each downloaded zip into its folder. Archives that cannot be opened and
# members that cannot be extracted are collected in `errors`.
start_time = time.time()
errors = []
# Start from a clean tree so nothing from a previous run lingers.
if os.path.isdir("Enaho"):
    shutil.rmtree("Enaho")
for yy in range(1997, 2019):
    for mod_code in mod_codes:
        if yy < 2003 and mod_code == "85":
            print("module 85 not available for this year")
            continue
        new_dir = "Enaho/in/Raw Data/module {0}/{1}".format(mod_code, yy)
        # Creates the missing parents (Enaho/in/Raw Data/module <mod>) as well.
        os.makedirs(new_dir)
        print("extracting data for year {0} - module {1}".format(yy, mod_code))
        zip_path = "Trash/module {0} {1}.zip".format(mod_code, yy)
        try:
            zip_ref = zipfile.ZipFile(zip_path)
        except (OSError, zipfile.BadZipFile):
            # The download step tolerates failures, so a missing or corrupt
            # archive here must not abort the remaining extractions.
            print("could not open {0}".format(zip_path))
            errors.append(zip_path)
            continue
        # `with` closes the archive even if something unexpected is raised.
        with zip_ref:
            for file_name in zip_ref.namelist():
                try:
                    zip_ref.extract(file_name, new_dir)
                except Exception:
                    # Best effort per member; Ctrl-C still propagates.
                    print("could not extract {0} for year {1}".format(file_name, yy))
                    errors.append(file_name)
print("Extraction complete. {0} errors:".format(len(errors)))
print(errors)
ellapsed = time.time() - start_time
print("This takes {0}s".format(ellapsed))
#Around 30s (under 1 min)
#3. Remove redundant enclosing folder (only a problem for some years)
# Some archives unpack into "<year>/<folder>/<files>" instead of
# "<year>/<files>". For those, the files of the first enclosing folder are
# moved one level up and the folder is removed.
start_time = time.time()
errors = []
for yy in range(1997, 2019):
    for mod_code in mod_codes:
        if yy < 2003 and mod_code == "85":
            print("module 85 not available for this year")
            continue
        file_mod_yy = "Enaho/in/Raw Data/module {0}/{1}".format(mod_code, yy)
        # os.walk yields nothing for a missing folder; the first entry is
        # (path, subfolder names, file names) of the folder itself.
        top_level = next(os.walk(file_mod_yy), None)
        if top_level is None:
            print("could not find {0}".format(file_mod_yy))
            errors.append(file_mod_yy)
            continue
        subfolders = top_level[1]
        if not subfolders:  # nothing to flatten
            continue
        enclosing = file_mod_yy + "/" + subfolders[0]
        # List the enclosing folder explicitly instead of assuming it is the
        # second entry produced by walking the parent.
        enclosed = next(os.walk(enclosing), None)
        enclosed_files = enclosed[2] if enclosed is not None else []
        all_moved = True
        for file in enclosed_files:
            file_path = enclosing + "/" + file
            try:
                shutil.move(file_path, file_mod_yy + "/" + file)
            except OSError:  # shutil.Error is an OSError subclass
                print("could not move {0}".format(file_path))
                errors.append(file_path)
                all_moved = False
        if all_moved:
            shutil.rmtree(enclosing)
        else:
            # Deleting the folder now would destroy the very files that were
            # just reported as errors; keep it for manual inspection.
            print("keeping {0} because some files could not be moved".format(enclosing))
print("Structuring complete. {0} errors:".format(len(errors)))
print(errors)
ellapsed = time.time() - start_time
print("This takes {0}s".format(ellapsed))
#Less than 1s
#4. Convert files for 1997-2003 ("ANTERIOR" class) from dbf to dta
#Exception: module 05 files for years 2001-2003 are split into two dbf files (E1)
def check_E1():
    """Return True when the current (mod_code, yy) pair is exception E1.

    E1 is module "05" in the years 2001, 2002 and 2003. The function takes no
    arguments: it reads the module-level loop variables `mod_code` and `yy`.
    """
    if mod_code != "05":
        return False
    return yy in (2001, 2002, 2003)
# Converts every .dbf file found in each module/year folder to a Stata .dta
# file with cleaned column names. For exception E1 (see check_E1) the two dbf
# files are converted individually ("<year>-1.dta", "<year>-2.dta") and also
# merged into "<year>.dta". Files that fail to convert are collected in
# `errors` as "module <mod>/<year>/<file>". The source .dbf files are kept.
start_time = time.time()
errors = []
for yy in range(1997, 2019):
    for mod_code in mod_codes:
        print("----")
        print(yy)
        if yy < 2003 and mod_code == "85":
            print("module 85 not available for this year")
            continue
        rel_dir = "module {0}/{1}".format(mod_code, yy)
        os.chdir(root + "/" + "Enaho/in/Raw Data/" + rel_dir)
        # set(): where globbing is case-insensitive both patterns match the
        # same file; sorted(): makes the processing order deterministic.
        dbf_list = sorted(set(glob.glob("*.dbf") + glob.glob("*.DBF")))
        if not check_E1():
            for dbf_fn in dbf_list:
                # splitext also strips an upper-case ".DBF" extension.
                dta_fn = os.path.splitext(dbf_fn)[0] + ".dta"
                try:
                    print("working on {0}".format(dbf_fn))
                    df = Dbf5(dbf_fn).to_dataframe()
                    # Lower-case names and drop NUL bytes / spaces.
                    df.columns = [
                        column.lower().replace("\x00", "").replace(" ", "")
                        for column in df.columns
                    ]
                    df.to_stata(dta_fn)
                    print("{0} converted to dta".format(dbf_fn))
                except Exception:
                    # Best effort per file; Ctrl-C still propagates.
                    print("{0} bugged, must convert manually".format(dbf_fn))
                    errors.append(rel_dir + "/" + dbf_fn)
        else:
            if len(dbf_list) < 2:
                # E1 needs two dbf files; record the folder and move on.
                print("expected two dbf files in {0}, found {1}".format(rel_dir, len(dbf_list)))
                errors.append(rel_dir)
                continue
            dbf1_fn, dbf2_fn = dbf_list[0], dbf_list[1]
            dta1_fn = "{0}-1".format(yy) + ".dta"
            dta2_fn = "{0}-2".format(yy) + ".dta"
            dta_fn = "{0}".format(yy) + ".dta"
            try:
                print("working on {0} and {1}".format(dbf1_fn, dbf2_fn))
                df1 = Dbf5(dbf1_fn).to_dataframe()
                df2 = Dbf5(dbf2_fn).to_dataframe()
                for current_df, current_dta in zip([df1, df2], [dta1_fn, dta2_fn]):
                    print("{0}".format(current_dta))
                    # Assigning .columns renames df1/df2 in place, so the
                    # merge below sees the cleaned key names.
                    current_df.columns = [
                        column.lower().replace("\x00", "").replace(" ", "")
                        for column in current_df.columns
                    ]
                    current_df.to_stata(current_dta)
                merge_vars = ["conglome", "vivienda", "hogar", "codperso"]
                df = df1.merge(df2, on=merge_vars)
                df.to_stata(dta_fn)
                print("{0} and {1} converted to dta".format(dbf1_fn, dbf2_fn))
            except Exception:
                print("{0} and {1} bugged, must convert manually".format(dbf1_fn, dbf2_fn))
                errors.append(rel_dir + "/" + dbf1_fn)
                errors.append(rel_dir + "/" + dbf2_fn)
print("Data conversion complete. {0} errors:".format(len(errors)))
print(errors)
ellapsed = time.time() - start_time
print("This takes {0}s".format(ellapsed))
#380s (=6 min)
#5. Rename dta files for ease of looping
# Leaves one "<year>.dta" per module/year folder. Module 85 uses the dataset
# whose name ends in "<year>-1.dta"; every other module uses its biggest dta
# file. Folders without a candidate and failed renames go into `errors`.
start_time = time.time()
errors = []
for yy in range(1997, 2019):
    for mod_code in mod_codes:
        if mod_code == "85" and yy < 2003:
            print("module 85 not available for this year")
            continue
        rel_dir = "module {0}/{1}".format(mod_code, yy)
        if mod_code != "85":
            print("doing mod {0} - year {1}".format(mod_code, yy))
        os.chdir(root + "/" + "Enaho/in/Raw Data/" + rel_dir)
        if mod_code == "85":
            # Special case: take the "yy-1" dataset.
            dta_files = glob.glob("*{0}-1.dta".format(yy))
            print(dta_files)
            chosen = dta_files[0] if dta_files else None
        else:
            # General criterion: rename (for use in Stata) the biggest file,
            # ignore the others. max() returns the first of equally sized files.
            dta_files = glob.glob("*.dta") + glob.glob("*.DTA")
            chosen = max(dta_files, key=os.path.getsize) if dta_files else None
        if chosen is None:
            # Nothing to rename (e.g. an earlier step failed for this folder).
            print("no dta file found in {0}".format(rel_dir))
            errors.append(rel_dir)
            continue
        try:
            os.rename(chosen, "{0}.dta".format(yy))
        except OSError:
            print("could not rename {0}".format(chosen))
            errors.append(rel_dir + "/" + chosen)
print("Renames complete. {0} errors:".format(len(errors)))
print(errors)
ellapsed = time.time() - start_time
print("This takes {0}s".format(ellapsed))
#Less than 1s
#6 Remove Trash
# Deletes the downloaded zip files; the (then empty) Trash folder is kept.
os.chdir(root)
for yy in range(1997, 2019):
    for mod_code in mod_codes:
        zip_file = "Trash/module {0} {1}.zip".format(mod_code, yy)
        try:
            os.remove(zip_file)
        except FileNotFoundError:
            print("no temporary files found for year {0} - module {1}".format(yy, mod_code))
        except OSError:
            # The file exists but could not be deleted (permissions, in use...);
            # report it accurately and keep cleaning up the rest.
            print("could not remove {0}".format(zip_file))