-
Notifications
You must be signed in to change notification settings - Fork 0
/
data-preprocessing.py
321 lines (246 loc) · 13.6 KB
/
data-preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# Library Imports
import fiona
import rtree
import shapely
from shapely.geometry import shape
import geopandas as gpd
import pandas as pd
import numpy as np
import subprocess
import os
import requests
from bs4 import BeautifulSoup
# Shapely library to help with calculation
# of the representative centroid position
from shapely.geometry import MultiPoint
# default dictionary (a dictionary with a default value if a key doesn't exist)
from collections import defaultdict
# To unzip file
import zipfile
# To have progress bar
from tqdm import tqdm
import json
# Helper function to create a new folder
def mkdir(path):
try:
os.makedirs(path)
except OSError:
if not os.path.isdir(path):
raise
else:
print("(%s) already exists" % (path))
# Main function
if __name__ == '__main__':
################################ CONSTANTS ################################
print('Initializing constants...')
# Buffer radius
BUFFER = 40233.6 # 25 miles in metres
# Census tracts shapefiles url
ct_shape_url = 'https://www.census.gov/geo/maps-data/data/cbf/cbf_tracts.html'
# Census tracts data url from 2012 - 2017
ct_file_name = 'acs_5_year_estimates_census_tracts.csv'
ct_data_url = 'https://www.dropbox.com/s/ni28x7mw6uh00dg/' + ct_file_name + '.zip?dl=1'
# American University Data
au_file_name = 'IPEDS_data.xlsx'
au_data_url = 'https://public.tableau.com/s/sites/default/files/media/Resources/' + au_file_name
# Directory of datasets
DATASETS_PATH = 'datasets/'
# Directory of census tract shapefile data
CENSUS_TRACTS_PATH = DATASETS_PATH + 'census_tracts/'
# Make the directory for the census tracts shapefiles data
mkdir(DATASETS_PATH)
################################ CENSUS TRACT FEATURES DATA ################################
print('Downloading census tract features data...')
# Download CENSUS TRACT SHAPEFILES data
if not os.path.isfile(DATASETS_PATH + ct_file_name):
os.system('!wget --directory-prefix={} -Nq {}'.format(DATASETS_PATH, ct_data_url))
# Unzipping the file
zip_ref = zipfile.ZipFile(DATASETS_PATH + ct_file_name + '.zip', 'r')
zip_ref.extractall(DATASETS_PATH + ct_file_name + '/')
zip_ref.close()
# Remove the old census tract .zip shapefile
subprocess.call(['rm', '-rf', DATASETS_PATH + ct_file_name + '.zip'])
# Store data in census_tracts
census_tracts = pd.read_csv(DATASETS_PATH + ct_file_name, encoding='ISO-8859-1', low_memory=False)
################################ UNIVERSITY DATA ################################
print('Downloading university data...')
# Download AMERICAN UNIVERSITIES data
if not os.path.isfile(DATASETS_PATH + au_file_name):
os.system('!wget --directory-prefix={} -Nq {}'.format(DATASETS_PATH, au_data_url))
# Store data in universities
universities = pd.read_excel(DATASETS_PATH + au_file_name, index_col='ID number')
# List of tuples to store all the university
# locations as Shapely Points
uni_list = []
for idx, id_num in enumerate(universities.index):
uni_list.append((id_num, shape({"type": "Point",
"coordinates": (universities.loc[id_num, 'Longitude location of institution'],
universities.loc[id_num, 'Latitude location of institution'])})))
# Setting our coordinate system for the tracts
# and universities to EPSG:2163 https://epsg.io/2163
uni_gdf = gpd.GeoDataFrame(geometry=[uni_pt for uniid, uni_pt in uni_list], index=[uniid for uniid, uni_pt in uni_list])
uni_gdf.crs = {'init': 'epsg:4269'}
uni_gdf.to_crs({'init': 'epsg:2163'}, inplace=True);
# ----------------------------
# Initialize rtree spatial index
# with just its centroid
uni_index = rtree.index.Index()
# Iterate over all american universities
for idx, (id_num, uni) in tqdm(enumerate(uni_gdf.iterrows())):
# add coordinates of univeristy location and store id along with it
uni_index.insert(idx, coordinates=uni.geometry.bounds, obj=id_num)
################################ CENSUS TRACT SHAPE FILES ################################
print('Downloading census tract shapefiles data...')
# Make request to get the webpage
r = requests.get(ct_shape_url)
soup = BeautifulSoup(r.content, "html.parser")
# Get the download links from the dropdown <option> tag
locations = soup.find('select',
{'name':'Location',
'id':'ct2017m'}).findChildren('option' , recursive=False)[1:]
# Put all the states and the urls for their shape files in a dictionary
state_urls = {location.text.strip() : location.attrs['value'] for location in locations}
# Download data
if not os.path.isdir(CENSUS_TRACTS_PATH):
# Make the directory for the census tracts shapefiles data
mkdir(CENSUS_TRACTS_PATH)
for state, state_url in state_urls.items():
os.system('wget --directory-prefix={} -Nq {}'.format(CENSUS_TRACTS_PATH, state_url))
# Storing the shape file names
census_tract_shapefiles = []
for state, state_url in state_urls.items():
# Extracting the name of shapefile
shapefile = state_url[state_url.rindex('/') + 1:]
census_tract_shapefiles.append(shapefile)
# Renaming the file
os.rename(CENSUS_TRACTS_PATH + shapefile, CENSUS_TRACTS_PATH + state + '.zip')
# Unzipping the file
zip_ref = zipfile.ZipFile(CENSUS_TRACTS_PATH + state + '.zip', 'r')
zip_ref.extractall(CENSUS_TRACTS_PATH + state + '/')
zip_ref.close()
# Remove the old census tract .zip shapefile
subprocess.call(['rm', '-rf', CENSUS_TRACTS_PATH + state + '.zip'])
# ----------------------------
# Let's calculate and store each of
# the centroids of each
# census tract in a list of tuples
tract_centroids = []
# ----------------------------
# Let's store a list of each census
# tract's geometry as well
tract_list = []
for subdir, dirs, files in list(os.walk(CENSUS_TRACTS_PATH))[1:]:
# Opening the shapefile
state_shapes = fiona.open(subdir, 'r')
# Looping through each census tract in each state and
# making key: geoid, value: centroid position
# of (longitude, latitude) as well as building the
# RTree index for census tracts
for idx, census_tract in enumerate(state_shapes):
geoid = census_tract['properties']['GEOID']
geometry = np.array(census_tract['geometry']['coordinates'])
points = None
# Some of the geometries are in a 2d and some in 3d array
if len([True for lat_long in geometry[0] if len(lat_long) != 2]) > 0:
# Create the Multipoint object to find centroid
points = MultiPoint(geometry[0][0])
else:
# Create the Multipoint object to find centroid
points = MultiPoint(geometry[0])
# ----------------------------
# Create Census tract Polygon
tract = shapely.geometry.shape(census_tract['geometry'])
# ----------------------------
# Store the Census tract Polygon
# in our list for the RTree index
# later on
tract_list.append((geoid, tract))
# ----------------------------
# Create Census tract Centroid
# True centroid, not necessarily an existing point
centroid_pt = points.centroid
# A represenative point, not centroid,
# that is guarnateed to be with the geometry
tract_centroids.append((geoid, shapely.geometry.shape(points.representative_point())))
# Setting our coordinate system for the tracts
# and universities to EPSG:2163 https://epsg.io/2163
tract_gdf = gpd.GeoDataFrame(geometry=[tract_shape for geoid, tract_shape in tract_list], index=[geoid for geoid, tract_pt in tract_list])
tract_gdf.crs = {'init': 'epsg:4269'}
tract_gdf.to_crs({'init': 'epsg:2163'}, inplace=True)
tract_centroids_gdf = gpd.GeoDataFrame(geometry=[tract_pt for geoid, tract_pt in tract_centroids], index=[geoid for geoid, tract_pt in tract_centroids])
tract_centroids_gdf.crs = {'init': 'epsg:4269'}
tract_centroids_gdf.to_crs({'init': 'epsg:2163'}, inplace=True)
# ----------------------------
# Initialize rtree spatial index
# for census tract centroids with
# CENTROID point bounding box
tract_centroid_index = rtree.index.Index()
# Iterate over all census tract centroids
for idx, (geoid, centroid) in tqdm(enumerate(tract_centroids_gdf.iterrows())):
# add coordinates of census tract and store geoid along with it
tract_centroid_index.insert(idx, coordinates=centroid.geometry.bounds, obj=geoid)
################################ CENSUS TRACTS WITHIN BUFFER RADIUS ################################
print('Finding census tracts that are within buffer radius...')
# Dictionary of Key: GeoID
# Value: List(GeoID)
# Contains which census tract centroids are accessible to
# that census tract within BUFFER radius
tracts_in_buffer = defaultdict(list)
# 1. Go through each census tract centroid
# 2. Create a BOUNDING BOX of BUFFER radius around that centroid
# 3. Find which centroids are inside that BOX
# 4. Check if that centroid exists in the BUFFER CIRCLE of the census tract
for idx, (geoid, centroid) in tqdm(enumerate(tract_centroids_gdf.iterrows())):
tracts_in_buffer[geoid] = []
# get iterable (generator) of indices from RTree index of census tracts centroids
# that are contained within a BUFFER BOX of the current centroid
# http://toblerity.org/rtree/tutorial.html#intersection
for overlapping_ind in tract_centroid_index.intersection(centroid.geometry.buffer(BUFFER).bounds):
# Check if current tract centroid's BUFFER CIRCLE
# contains the overlapping index's CENTROID
if tract_centroids_gdf.iloc[overlapping_ind].name != geoid and centroid.geometry.buffer(BUFFER).contains(shapely.geometry.shape(tract_centroids_gdf.iloc[overlapping_ind].geometry)):
# add the overlapping_ind's geoID of the census tract into the current census tract's list
# of census tracts accessible within buffer radius
tracts_in_buffer[geoid].append(tract_centroids_gdf.iloc[overlapping_ind].name)
# Store centroid BUFFER dict to file
print('Saving to ./datasets/tracts_in_buffer.json...')
with open('./datasets/tracts_in_buffer.json', 'w') as fp:
json.dump(tracts_in_buffer, fp)
################################ UNIVERSITIES WITHIN BUFFER RADIUS ################################
print('Finding universities that are within buffer radius...')
# Dictionary of Keys: geoId
# Value: List(id_num of university)
# Contains which universities are accessible to
# that census tract within BUFFER radius
tract_universities = defaultdict(list)
# 1. Go through each census tract centroid
# 2. Create a BOUNDING BOX of BUFFER radius around that centroid
# 3. Find which universities are inside that BOX
# 4. Check if that university exists in the BUFFER CIRCLE of the census tract
for idx, (geoid, centroid) in tqdm(enumerate(tract_centroids_gdf.iterrows())):
tract_universities[geoid] = []
# get iterable (generator) of indices from RTree index of universities
# that are contained within a BUFFER BOX of the current centroid
# http://toblerity.org/rtree/tutorial.html#intersection
for overlapping_ind in uni_index.intersection(centroid.geometry.buffer(BUFFER).bounds):
# Check if current tract centroid's BUFFER CIRCLE
# contains the overlapping UNIVERSITY index's centre point
if centroid.geometry.buffer(BUFFER).contains(shapely.geometry.shape(uni_gdf.iloc[overlapping_ind].geometry)):
# add the overlapping_ind's geoID of the census tract into the current census tract's list
# of census tracts accessible within buffer radius
tract_universities[geoid].append(str(uni_gdf.iloc[overlapping_ind].name))
# Store centroid BUFFER dict of universities to file
print('Saving to ./datasets/tract_universities.json...')
with open('./datasets/tract_universities.json', 'w') as fp:
json.dump(tract_universities, fp)
# Create a dictionary of number of universities accessible to each census tract
num_accessible_unis = {geoid: len(unis) for geoid, unis in tract_universities.items()}
# Creating Pandas Dataframe from dictionary
education_deserts = pd.DataFrame.from_dict(num_accessible_unis, orient='index', columns=['Number of Accessible Universities'])
education_deserts['Education Desert'] = 0
education_deserts.loc[education_deserts['Number of Accessible Universities'] == 0, 'Education Desert'] = 1
print('Number of Education Deserts in our dataset: {}'.format(len(education_deserts[education_deserts['Education Desert'] == 1])))
print('Number of Non-Education Deserts in our dataset: {}'.format(len(education_deserts[education_deserts['Education Desert'] == 0])))
# Write to csv file
print('Saving to ./datasets/education_deserts.csv...')
education_deserts.to_csv(r'./datasets/education_deserts.csv')