Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor usafacts to use geo utils #316

Merged
merged 11 commits into from
Oct 30, 2020
120 changes: 25 additions & 95 deletions usafacts/delphi_usafacts/geo.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,10 @@
# -*- coding: utf-8 -*-
import pandas as pd

from delphi_utils import GeoMapper

# Base population used when reporting incidence (cases per 100,000 people).
INCIDENCE_BASE = 100000
# https://code.activestate.com/recipes/577775-state-fips-codes-dict/
# Two-letter state/territory postal abbreviation -> two-digit, zero-padded
# state FIPS code (includes DC and PR).
STATE_TO_FIPS = {
    "WA": "53",
    "DE": "10",
    "DC": "11",
    "WI": "55",
    "WV": "54",
    "HI": "15",
    "FL": "12",
    "WY": "56",
    "PR": "72",
    "NJ": "34",
    "NM": "35",
    "TX": "48",
    "LA": "22",
    "NC": "37",
    "ND": "38",
    "NE": "31",
    "TN": "47",
    "NY": "36",
    "PA": "42",
    "AK": "02",
    "NV": "32",
    "NH": "33",
    "VA": "51",
    "CO": "08",
    "CA": "06",
    "AL": "01",
    "AR": "05",
    "VT": "50",
    "IL": "17",
    "GA": "13",
    "IN": "18",
    "IA": "19",
    "MA": "25",
    "AZ": "04",
    "ID": "16",
    "CT": "09",
    "ME": "23",
    "MD": "24",
    "OK": "40",
    "OH": "39",
    "UT": "49",
    "MO": "29",
    "MN": "27",
    "MI": "26",
    "RI": "44",
    "KS": "20",
    "MT": "30",
    "MS": "28",
    "SC": "45",
    "KY": "21",
    "OR": "41",
    "SD": "46",
}

SECONDARY_FIPS = [
("51620", ["51093", "51175"]),
("51685", ["51153"]),
Expand All @@ -76,32 +23,11 @@
("46102", "46113"),
]

# Inverse of STATE_TO_FIPS: two-digit state FIPS code -> lower-case postal
# abbreviation.
FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}


def fips_to_state(fips: str) -> str:
    """Map a USAFacts county FIPS code to its state's postal abbreviation.

    The first two digits of a five-digit, zero-padded county FIPS code
    identify the state; they are looked up in FIPS_TO_STATE to obtain the
    two-letter postal abbreviation.

    Parameters
    ----------
    fips: str
        Five digit, zero padded county FIPS code

    Returns
    -------
    str
        Two-letter postal abbreviation, lower case.

    Raises
    ------
    KeyError
        Input FIPS code not recognized.
    """
    state_code = fips[:2]
    return FIPS_TO_STATE[state_code]
# Valid geographical resolutions output by this indicator.
VALID_GEO_RES = ("county", "state", "msa", "hrr")
# Sensors that report proportions. For geo resolutions with unallocated cases
# or deaths, we avoid reporting these sensors.
PROP_SENSORS = ("incidence", "cumulative_prop")


def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
Expand All @@ -121,9 +47,9 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
pd.DataFrame
Dataframe with same schema as df, with the counts disbursed.
"""
COLS = ["new_counts", "cumulative_counts"]
cols = ["new_counts", "cumulative_counts"]
df = df.copy().sort_values(["fips", "timestamp"])
for col in COLS:
for col in cols:
# Get values from the aggregated county:
vals = df.loc[df["fips"] == pooled_fips, col].values / len(fips_list)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small nitpick, but would be good to standardize single vs double quotes within the file

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

for fips in fips_list:
Expand All @@ -142,7 +68,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
Columns: fips, timestamp, new_counts, cumulative_counts, population ...
geo_res: str
Geographic resolution to which to aggregate. Valid options:
('county', 'state', 'msa', 'hrr').
("county", "state", "msa", "hrr").
map_df: pd.DataFrame
Loaded from static file "fips_prop_pop.csv".
sensor: str
Expand All @@ -155,29 +81,31 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
pd.DataFrame
Columns: geo_id, timestamp, ...
"""
VALID_GEO_RES = ("county", "state", "msa", "hrr")
#It is not clear how to calculate the proportion for unallocated cases/deaths
PROP_SENSORS = ("incidence", "cumulative_prop")
if geo_res not in VALID_GEO_RES:
raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

df_mega = df[df['fips'].astype(int) % 1000 == 0].copy()
# State-level records unassigned to specific counties are coded as fake
# counties with fips XX000.
unassigned_counties = df[df["fips"].str.endswith("000")].copy()

df = df[df['fips'].astype(int) % 1000 != 0].copy()
df = df[df["fips"].astype(int) % 1000 != 0].copy()
# Disburse unallocated cases/deaths in NYC to NYC counties
df = disburse(df, NYC_FIPS[0][0], NYC_FIPS[0][1])
df = df[df['fips'] != NYC_FIPS[0][0]]
df = df[df["fips"] != NYC_FIPS[0][0]]

if geo_res == "county":
if sensor not in PROP_SENSORS:
df = df.append(df_mega)
# It is not clear how to calculate the proportion for unallocated
# cases/deaths, so we exclude them for those sensors.
df = df.append(unassigned_counties)
df["geo_id"] = df["fips"]
elif geo_res == "state":
# Grab first two digits of fips
# Map state fips to us postal code
# Add unallocated cases/deaths
df = df.append(df_mega)
df["geo_id"] = df["fips"].apply(fips_to_state)
df = df.append(unassigned_counties)
geo_mapper = GeoMapper()
df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id")
elif geo_res in ("msa", "hrr"):
# Map "missing" secondary FIPS to those that are in our canonical set
for fips, fips_list in SECONDARY_FIPS:
Expand All @@ -189,12 +117,14 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
map_df["geo_id"] = map_df[colname].astype(int)
df["fips"] = df["fips"].astype(int)
merged = pd.merge(df, map_df, on="fips")
merged["cumulative_counts"] = merged["cumulative_counts"] * merged["pop_prop"]
merged["cumulative_counts"] =\
merged["cumulative_counts"] * merged["pop_prop"]
merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
merged["population"] = merged["population"] * merged["pop_prop"]
df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
df = df.drop("fips", axis=1)
df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
df["cumulative_prop"] =\
df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
return df
6 changes: 4 additions & 2 deletions usafacts/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
@pytest.fixture(scope="session")
def run_as_module():
# Clean receiving directory
for fname in listdir("receiving"):
remove(join("receiving", fname))
for fname in listdir("../receiving"):
if fname[0] == ".":
continue
remove(join("../receiving", fname))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be made into a one liner with

[remove(fname) for fname in glob.glob('receiving/*')]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops realized this is part of #314


run_module()
42 changes: 17 additions & 25 deletions usafacts/tests/test_geo.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,22 @@
import pytest

from os.path import join

import pytest

import numpy as np
import pandas as pd
from delphi_usafacts.geo import fips_to_state, disburse, geo_map
from delphi_usafacts.geo import disburse, geo_map

MAP_DF = pd.read_csv(
join("..", "static", "fips_prop_pop.csv"),
dtype={"fips": int}
)

sensor = "new_counts"
class TestFipsToState:
    """Tests for `geo.fips_to_state()`."""

    def test_normal(self):
        """Spot-check county FIPS codes against known state abbreviations."""
        cases = {
            "53003": "wa",
            "48027": "tx",
            "12003": "fl",
            "50103": "vt",
            "15003": "hi",
        }
        for fips, expected in cases.items():
            assert fips_to_state(fips) == expected

SENSOR = "new_counts"

class TestDisburse:
"""Tests for the `geo.disburse()` function."""
def test_even(self):

"""Tests that values are disbursed evenly across recipients."""
df = pd.DataFrame(
{
"fips": ["51093", "51175", "51620"],
Expand All @@ -43,8 +34,9 @@ def test_even(self):


class TestGeoMap:
"""Tests for `geo.geo_map()`."""
def test_incorrect_geo(self):

"""Tests that an invalid resolution raises an error."""
df = pd.DataFrame(
{
"fips": ["53003", "48027", "50103"],
Expand All @@ -56,10 +48,10 @@ def test_incorrect_geo(self):
)

with pytest.raises(ValueError):
geo_map(df, "département", MAP_DF, sensor)
geo_map(df, "département", MAP_DF, SENSOR)

def test_county(self):

"""Tests that values are correctly aggregated at the county level."""
df = pd.DataFrame(
{
"fips": ["53003", "48027", "50103"],
Expand All @@ -70,7 +62,7 @@ def test_county(self):
}
)

new_df = geo_map(df, "county", MAP_DF, sensor)
new_df = geo_map(df, "county", MAP_DF, SENSOR)

exp_incidence = df["new_counts"] / df["population"] * 100000
exp_cprop = df["cumulative_counts"] / df["population"] * 100000
Expand All @@ -81,7 +73,7 @@ def test_county(self):
assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)

def test_state(self):

"""Tests that values are correctly aggregated at the state level."""
df = pd.DataFrame(
{
"fips": ["04001", "04003", "04009", "25023"],
Expand All @@ -92,7 +84,7 @@ def test_state(self):
}
)

new_df = geo_map(df, "state", MAP_DF, sensor)
new_df = geo_map(df, "state", MAP_DF, SENSOR)

exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
Expand All @@ -106,7 +98,7 @@ def test_state(self):
assert (new_df["cumulative_prop"].values == exp_cprop).all()

def test_hrr(self):

"""Tests that values are correctly aggregated at the HRR level."""
df = pd.DataFrame(
{
"fips": ["13009", "13017", "13021", "09015"],
Expand All @@ -117,7 +109,7 @@ def test_hrr(self):
}
)

new_df = geo_map(df, "hrr", MAP_DF, sensor)
new_df = geo_map(df, "hrr", MAP_DF, SENSOR)

exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000
Expand All @@ -131,7 +123,7 @@ def test_hrr(self):
assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop)

def test_msa(self):

"""Tests that values are correctly aggregated at the MSA level."""
df = pd.DataFrame(
{
"fips": ["13009", "13017", "13021", "09015"],
Expand All @@ -142,7 +134,7 @@ def test_msa(self):
}
)

new_df = geo_map(df, "msa", MAP_DF, sensor)
new_df = geo_map(df, "msa", MAP_DF, SENSOR)

exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
Expand Down
4 changes: 2 additions & 2 deletions usafacts/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class TestRun:
def test_output_files_exist(self, run_as_module):

csv_files = listdir("receiving")
csv_files = listdir("../receiving")

dates = [
"20200229",
Expand Down Expand Up @@ -48,6 +48,6 @@ def test_output_files_exist(self, run_as_module):
def test_output_file_format(self, run_as_module):

df = pd.read_csv(
join("receiving", "20200310_state_confirmed_cumulative_num.csv")
join("../receiving", "20200310_state_confirmed_cumulative_num.csv")
)
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
4 changes: 2 additions & 2 deletions usafacts/tests/test_smooth.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ def test_output_files_smoothed(self, run_as_module):
dates = [str(x) for x in range(20200304, 20200311)]

smoothed = pd.read_csv(
join("receiving",
join("../receiving",
f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
)

raw = pd.concat([
pd.read_csv(
join("receiving",
join("../receiving",
f"{date}_state_confirmed_cumulative_num.csv")
) for date in dates
])
Expand Down