# %%
import polars as pl
from urllib.request import urlopen
import json
import plotly.express as px
import plotly
import os
# %%
# This is much better than what I was doing. This actually has the right number of states
counties = pl.read_csv("https://github.com/kjhealy/fips-codes/raw/master/state_and_county_fips_master.csv")\
    .filter(pl.col("state") != "NA").drop('name')
# %%
# Cache locally just so I don't have to keep downloading it as I tweak things
# (os.path.exists doesn't expand ~, so the path needs expanduser'ing first)
_local = os.path.expanduser('~/Documents/Class/DS460/DisasterDeclarationsSummaries.csv')
if os.path.exists(_local):
    _disasters = pl.read_csv(_local)
else:
    _disasters = pl.read_csv('https://www.fema.gov/api/open/v2/DisasterDeclarationsSummaries.csv')
# %%
# Get relevant disaster data
# Data dictionary
# https://www.fema.gov/openfema-data-page/disaster-declarations-summaries-v2
# Based on the link, and looking at data for Hurricane Katrina, it seems like the counties
# where these programs were declared determine whether it was actually a serious disaster,
# or if it just kinda passed through there technically (i.e. apparently Katrina technically hit ID)
disasters = _disasters\
.filter(pl.col('ihProgramDeclared').cast(pl.Boolean) | pl.col('iaProgramDeclared').cast(pl.Boolean))\
.select(
'incidentType',
# I WOULD do .round(-1), but SOMEONE doesn't support integer rounding
pl.col('incidentBeginDate').str.to_datetime().dt.year()\
.cut(list(range(1950, 2040, 10)),
labels=list(map(lambda i: f"{i}s", range(1940, 2040, 10))))\
.alias('decade'),
# 'state',
    ( # This combines the state and county codes properly (ssccc, with padding 0's in county)
pl.col('fipsStateCode').cast(str) + \
pl.col('fipsCountyCode').cast(str).str.rjust(3, '0')
).cast(pl.Int64).alias('fips'),
)\
.filter(pl.col('decade') != "1950s") # The only things in the 50's are some floods. Not hugely relevant.
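# %%
# Quick sanity check of the fips combination above (illustrative, with Los
# Angeles as a hypothetical example: state 6 + county 37 should come out 6037)
_fipsDemo = pl.DataFrame({'fipsStateCode': [6], 'fipsCountyCode': [37]}).select(
    (
        pl.col('fipsStateCode').cast(str) +
        pl.col('fipsCountyCode').cast(str).str.rjust(3, '0')
    ).cast(pl.Int64).alias('fips')
)
assert _fipsDemo['fips'][0] == 6037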
# %%
# Pivot the disaster data the way we want it
disastersWeCareAbout = [
'Hurricane',
'Flood',
'Tornado',
'Severe Storm',
'Fire',
'Earthquake',
'Winter Storm',
    # 'Typhoon', Looks like there's been like, 1 typhoon. Not really concerning.
'Coastal Storm',
]
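# %%
# One way to sanity-check which incident types are worth keeping (a sketch, not
# part of the pipeline): count how often each type shows up. Rare types like
# Typhoon land at the bottom of a list like this.
print(disasters['incidentType'].value_counts())
# %%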
# Note: These are separated because we use just longWideDisasters later for graphing
longWideDisasters = disasters.pivot(
values='incidentType',
index=('fips', 'decade'),
columns='incidentType',
aggregate_function='count',
).select('fips', 'decade', *disastersWeCareAbout)\
.fill_null(0)
wideDisasters = longWideDisasters.pivot(
values=disastersWeCareAbout,
index='fips',
columns='decade',
).fill_null(0)
# .drop('Biological', 'Other', 'Snowstorm', 'Dam/Levee Break', 'Freezing', 'Toxic Substances',
#       'Drought', 'Human Cause', 'Fishing Losses', 'Chemical', 'Terrorist', 'Volcanic Eruption',
#       'Severe Ice Storm', 'Mud/Landslide'
# )\
# The types above don't feel particularly relevant.
# After looking at the data, it looks like for Volcanic Eruption, there's Hawaii (obviously),
# and a few in Washington, which I realized must be Mt. St. Helens. I'm removing it because
# it seems like they know how to deal with them in Hawaii, and Mt. St. Helens isn't gonna blow
# again anytime soon
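# %%
# Sketch of what the first pivot above does, on a toy frame (illustrative only):
# rows of (fips, decade, incidentType) become one row per (fips, decade), with a
# count column per incident type
_toy = pl.DataFrame({
    'fips': [6037, 6037, 6037],
    'decade': ['1990s', '1990s', '2000s'],
    'incidentType': ['Fire', 'Fire', 'Flood'],
})
print(_toy.pivot(values='incidentType', index=('fips', 'decade'),
                 columns='incidentType', aggregate_function='count').fill_null(0))
# -> (6037, 1990s) gets Fire=2, Flood=0; (6037, 2000s) gets Fire=0, Flood=1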
# %%
# Add the disasters to the data
labeledCounties = counties.join(wideDisasters, 'fips', how='inner').drop('state')
# %%
# Remove any columns that don't have any disasters in them -- not actually useful anymore
# drop = []
# for c in labeledCounties.columns:
# if labeledCounties[c].dtype == pl.UInt32:
# if not labeledCounties[c].cast(pl.Boolean).any():
# drop.append(c)
# labeledCounties = labeledCounties.drop(drop)
# %%
# We don't actually care about doing this anymore. It's not relevant, especially after
# splitting up into decades
if False:
    # If you have occasional fires/floods/storms in the county, that's not indicative
# of a larger problem. We want to know if it's at significant risk
# This data does go all the way back to 1964 after all
filtered = labeledCounties.with_columns(
# The thresholds I got from looking at the .describe() stats
        # I'm justifying this because after making a graph, it's not nearly as widespread
        # as I assumed. I think they're only counting wildfires after all
(pl.col('Fire') > 0).alias('Fire'),
(pl.col('Flood') > 2).alias('Flood'),
(pl.col('Severe Storm') > 0).alias('Severe Storm'),
        # ANY tornadoes are bad
(pl.col('Tornado') > 0).alias('Tornado'),
        # By looking at https://www.fema.gov/disaster/how-declared, it looks like
        # the disaster has to be approved as a "major disaster" by the president
        # before it ends up here (if I'm reading this right), which should mean
        # the measly California quakes shouldn't be included
(pl.col('Earthquake') > 0).alias('Earthquake'),
(pl.col('Hurricane') > 0).alias('Hurricane'),
(pl.col('Coastal Storm') > 0).alias('Coastal Storm'),
(pl.col('Typhoon') > 0).alias('Typhoon'),
)
    # Upon consideration (and several graphs), I'm not sure this is necessary after all.
    # I think that just filtering based on the programs above is sufficient.
# %%
# Now write it properly distributed
# labeledCounties.write_parquet('disasters/disasters.parquet', use_pyarrow=True)
# The number of rows each shard will get
size = len(labeledCounties) // 5
# Split them all up into a list of smaller datasets
split = [labeledCounties[size*i:size*(i+1)] for i in range(4)]
# Because we're doing // instead of /, we might have an off-by-one error.
# This fixes that.
split.append(labeledCounties[size*4:])
assert sum(map(len, split)) == len(labeledCounties), "You split the dataset wrong"
# NOW actually write the data
for cnt, data in enumerate(split):
data.write_parquet(f'disasters/disasters_{cnt}.parquet', use_pyarrow=True)
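# %%
# Quick check on the shard math (illustrative): with, say, 3143 counties,
# size = 3143 // 5 = 628, the first four shards get 628 rows each, and the
# last shard picks up the remaining 631, so nothing gets dropped.
print([len(d) for d in split], '->', len(labeledCounties), 'rows total')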
# %%
# Just some quick code to make a long formatted version
# ('state' was already dropped when building labeledCounties, and there's no
# 'state_right' column, so no extra drop is needed here)
longVersion = labeledCounties.melt('fips', variable_name='disaster')\
    .filter(pl.col('value').cast(pl.Boolean))\
    .drop('value')\
    .with_columns(pl.col('fips').cast(pl.Utf8).str.rjust(5,'0').alias('fips'))
# %%
# The problem here is that a bunch of counties have multiple disasters they have
# to deal with. That sucks for them, but it also sucks for us, because each county
# can only have 1 color. So what I'm doing here is prioritizing them so only the
# most worrisome is shown.
# This is kind of a janky way of doing it, but I spent 2 hours trying to figure
# out how to do this, and this is the only way I could find.
# def generateDecadeGraphData(decade):
# decadeData = longWideDisasters.filter(pl.col('decade') == decade)
_priority = [
'Hurricane',
'Tornado',
'Earthquake',
'Fire',
'Flood',
'Severe Storm',
'Coastal Storm',
]
# NO idea why this is necessary. All of a sudden it started giving me a runtime error
cols = longWideDisasters.columns
def prioritize(r):
    # r is a plain tuple of (fips, decade, *counts); return the highest-priority
    # disaster with a nonzero count in this row, or None if there weren't any
    for p in _priority:
        if r[cols.index(p)]:
            return r[0], r[1], p
    return r[0], r[1], None
prioritized = longWideDisasters\
.map_rows(prioritize)\
.drop_nulls()\
.select(
pl.col('column_0').cast(pl.Utf8).str.rjust(5,'0').alias('fips'),
pl.col('column_1').alias('decade'),
pl.col('column_2').alias('disaster')
)
# graphData = [generateDecadeGraphData(decade) for decade in list(longWideDisasters['decade'].unique())]
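# %%
# Hedged aside: a pure-expression alternative to the map_rows above (a sketch;
# the map_rows version is what actually gets used below). pl.coalesce takes the
# first non-null value, so listing the disasters in priority order does the
# same prioritization without leaving the expression engine:
_prioritizedAlt = longWideDisasters.select(
    pl.col('fips').cast(pl.Utf8).str.rjust(5, '0'),
    'decade',
    pl.coalesce([
        pl.when(pl.col(p) > 0).then(pl.lit(p)) for p in _priority
    ]).alias('disaster'),
).drop_nulls()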
# %%
# Get the shapes of all the counties in a geoJSON file
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
countyShapes = json.load(response)
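# %%
# Quick look at the geojson (illustrative): each feature's id is a 5-character,
# zero-padded fips string, which is why the fips column gets rjust(5, '0')'d
# before being handed to plotly
print(countyShapes['features'][0]['id'])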
# %%
# A graph of what disaster each county is most worried about
fig = px.choropleth(prioritized,
geojson=countyShapes, locations='fips',
color='disaster', facet_col='decade', facet_col_wrap=4,
# color_discrete_sequence=px.colors.qualitative.G10,
color_discrete_map={
"Tornado": '#550000',
"Hurricane": '#65d6ff',
"Earthquake": '#aa6511',
"Flood": '#0835ff',
"Severe Storm": '#00c106',
"Coastal Storm": '#426a8d',
"Fire": '#e60004',
# "Typhoon": '#a91fc1',
},
scope="usa",
    title='What Counties had Which Disasters in What Decade',
)
fig.update_layout(
margin={"r":0,"t":40,"l":0,"b":0},
# autosize=False,
# width=2000,
# height=1500,
legend_tracegroupgap=0,
legend_traceorder='grouped',
)
pass  # keeps the cell from echoing the figure object when run interactively
# %%
# img = plotly.io.to_html(fig)
img = plotly.io.to_image(fig, format='png')  # requires the kaleido package
# %%
with open('images/disastersOverDecades.png', 'wb') as f:
f.write(img)
# %%
# A graph of just where tornadoes are a problem, because it's kinda interesting
# (the wide pivot gave the melted columns names like 'Tornado_...', so match on the prefix)
justTornadoes = longVersion.filter(pl.col('disaster').str.starts_with('Tornado'))
fig = px.choropleth(justTornadoes, geojson=countyShapes, locations='fips',
color='disaster',
color_discrete_sequence=px.colors.qualitative.G10,
scope="usa",
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()