Add handling of unassigned cases/deaths to jhu
Co-authored-by: krivard <krivard@cs.cmu.edu>
Jingjing Tang and krivard committed Jul 1, 2020
1 parent 6cb7310 commit 5ff04c0
Showing 6 changed files with 230 additions and 26 deletions.
31 changes: 28 additions & 3 deletions jhu/delphi_jhu/geo.py
@@ -89,6 +89,10 @@

FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}

# Map JHU's fake FIPS codes for unassigned cases/deaths to megacounty FIPS codes

JHU_FAKE_FIPS_TO_MEGA_FIPS = {f'900{x}' : f'{x}000' for x in STATE_TO_FIPS.values()}
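# Illustrative note (not part of this change): assuming STATE_TO_FIPS maps postal
# abbreviations to zero-padded two-digit FIPS strings (e.g. "AL" -> "01"), the
# comprehension above yields pairs such as {"90001": "01000", "90013": "13000"},
# i.e. JHU's fake "900xx" codes for unassigned cases map to the "xx000" megacounty codes.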


def fips_to_state(fips: str) -> str:
"""Wrapper that handles exceptions to the FIPS scheme in the JHU data.
@@ -148,7 +152,7 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
return df


def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
"""
Maps a DataFrame df, which contains data at the county resolution, and
aggregates it to the geographic resolution geo_res.
@@ -162,22 +166,38 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
('county', 'state', 'msa', 'hrr').
map_df: pd.DataFrame
Loaded from static file "fips_prop_pop.csv".
sensor: str
Sensor type. Valid options:
("new_counts", "cumulative_counts",
"incidence", "cumulative_prop")
Returns
-------
pd.DataFrame
Columns: geo_id, timestamp, ...
"""
VALID_GEO_RES = ("county", "state", "msa", "hrr")
# It is not clear how to calculate the proportion for unassigned cases/deaths
PROP_SENSORS = ("incidence", "cumulative_prop")
if geo_res not in VALID_GEO_RES:
raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
df = df.copy()

df_mega = df[df['fips'].astype(int) >= 90001].copy()
df_mega['geo_id'] = df_mega['fips'].apply(lambda x: JHU_FAKE_FIPS_TO_MEGA_FIPS[x])

df = df[df['fips'].astype(int) < 90001].copy()

if geo_res == "county":
df["geo_id"] = df["fips"]
if sensor not in PROP_SENSORS:
df = df.append(df_mega)
elif geo_res == "state":
# Grab first two digits of fips
# Map state fips to us postal code
df["geo_id"] = df["fips"].apply(fips_to_state)
df["geo_id"] = df["fips"]
# Add unassigned cases/deaths
df = df.append(df_mega)
df["geo_id"] = df["geo_id"].apply(fips_to_state)
elif geo_res in ("msa", "hrr"):
# Disburse Dukes & Nantucket to individual counties
df = disburse(df, DN_FIPS, DN_COUNTY_FIPS)
@@ -200,8 +220,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame):
merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
merged["population"] = merged["population"] * merged["pop_prop"]
df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
# if sensor not in PROP_SENSORS:
# df_mega["geo_id"] = df_mega["geo_id"].apply(fips_to_state)
# df = df.append(df_mega)
df = df.drop("fips", axis=1)
df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()

# Values computed for megacounties are not meaningful (population is unknown) and are not considered in the main function
df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
return df
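As a rough sketch of the new behavior (toy data; JHU_FAKE_FIPS_TO_MEGA_FIPS is stood in for by a literal dict, and everything else is invented for illustration):

import numpy as np
import pandas as pd

# One real county plus one fake "unassigned" FIPS for Alabama.
df = pd.DataFrame({
    "fips": ["01001", "90001"],
    "timestamp": ["2020-02-15", "2020-02-15"],
    "new_counts": [5, 2],
    "cumulative_counts": [50, 12],
    "population": [55000, np.nan],  # population is unknown for fake FIPS
})

# Rows with fips >= 90001 are split off and relabeled as megacounties.
df_mega = df[df["fips"].astype(int) >= 90001].copy()
df_mega["geo_id"] = df_mega["fips"].map({"90001": "01000"})
df = df[df["fips"].astype(int) < 90001].copy()

# County level: megacounty rows are appended back only for count sensors;
# for "incidence"/"cumulative_prop" they are skipped, since their population
# is NaN and the resulting proportions would not be meaningful.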
8 changes: 6 additions & 2 deletions jhu/delphi_jhu/pull.py
@@ -62,7 +62,7 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
MIN_FIPS = 1000
MAX_FIPS = 57000
EXTRA_FIPS = (
72, # Puerto Rico (provided as the entire state)
72, # Puerto Rico (provided as the entire state)
70002, # Kansas City, MO
70003, # Dukes and Nantucket Counties, MA
)
@@ -79,9 +79,13 @@ def pull_jhu_data(base_url: str, metric: str, pop_df: pd.DataFrame) -> pd.DataFr
& (df["FIPS"] < MAX_FIPS)
) # "Uncategorized", etc.
| df["FIPS"].isin(EXTRA_FIPS)
# Keep fake FIPS codes for unassigned cases
| np.logical_and(df['FIPS'] >= 90001,
df['FIPS'] <= 90056)
]
# Merge in population LOWERCASE, consistent across confirmed and deaths
df = pd.merge(df, pop_df, on="FIPS")
# Left merge keeps fake-FIPS rows; their population is left as NaN
df = pd.merge(df, pop_df, on="FIPS", how='left')

# Manual correction for PR
df.loc[df["FIPS"] == 72, "FIPS"] = 72000
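A minimal sketch of what the widened filter and the left merge do to the fake-FIPS rows (column names follow the diff; the toy frames are invented):

import numpy as np
import pandas as pd

df = pd.DataFrame({"FIPS": [1001, 90001], "new_counts": [5, 2]})
pop_df = pd.DataFrame({"FIPS": [1001], "population": [55000]})

# Fake FIPS 90001..90056 now pass the filter alongside real county codes.
keep = ((df["FIPS"] >= 1000) & (df["FIPS"] < 57000)) | \
       np.logical_and(df["FIPS"] >= 90001, df["FIPS"] <= 90056)
df = df[keep]

# how="left" keeps the fake-FIPS rows; their population comes back as NaN.
df = pd.merge(df, pop_df, on="FIPS", how="left")
print(df.loc[df["FIPS"] == 90001, "population"])  # NaN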
2 changes: 1 addition & 1 deletion jhu/delphi_jhu/run.py
@@ -77,7 +77,7 @@ def run_module():
print(geo_res, metric, sensor, smoother)
df = dfs[metric]
# Aggregate to appropriate geographic resolution
df = geo_map(df, geo_res, map_df)
df = geo_map(df, geo_res, map_df, sensor)
df["val"] = SMOOTHERS_MAP[smoother][0](df[sensor].values)
df["se"] = np.nan
df["sample_size"] = np.nan
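The run.py change is just threading the sensor name through to geo_map; a hedged usage sketch at the county resolution (df and map_df stand for whatever the module already built):

# At the county level, megacounty rows are kept for count sensors and
# dropped for proportion sensors.
counts_df = geo_map(df, "county", map_df, "new_counts")       # megacounty rows kept
prop_df = geo_map(df, "county", map_df, "cumulative_prop")    # megacounty rows dropped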
120 changes: 120 additions & 0 deletions jhu/tests/receiving/.gitignore
@@ -0,0 +1,120 @@
# You should hard commit a prototype for this file, but we
# want to avoid accidental adding of API tokens and other
# private data parameters
params.json

# Do not commit output files
receiving/*.csv

# Remove macOS files
.DS_Store

# virtual environment
dview/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
coverage.xml
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
93 changes: 74 additions & 19 deletions jhu/tests/test_geo.py
@@ -25,6 +25,13 @@ def test_normal(self):
assert fips_to_state("12003") == "fl"
assert fips_to_state("50103") == "vt"
assert fips_to_state("15003") == "hi"

def test_mega(self):

assert fips_to_state("01000") == "al"
assert fips_to_state("13000") == "ga"
assert fips_to_state("44000") == "ri"
assert fips_to_state("12000") == "fl"


class TestDisburse:
@@ -60,7 +67,7 @@ def test_incorrect_geo(self):
)

with pytest.raises(ValueError):
geo_map(df, "département", MAP_DF)
geo_map(df, "département", MAP_DF, 'new_counts')

def test_county(self):

@@ -74,15 +81,27 @@ def test_county(self):
}
)

new_df = geo_map(df, "county", MAP_DF)
df_mega = pd.DataFrame(
{
"fips": ["90013", "90001"],
"timestamp": ["2020-02-15", "2020-02-15"],
"new_counts": [8, 2],
"cumulative_counts": [80, 12],
"population": [np.nan, np.nan],
}
)

df = df.append(df_mega)

new_df = geo_map(df, "county", MAP_DF, 'new_counts')

exp_incidence = df["new_counts"] / df["population"] * 100000
exp_cprop = df["cumulative_counts"] / df["population"] * 100000

assert set(new_df["geo_id"].values) == set(df["fips"].values)
assert set(new_df["geo_id"].values) == set(['01000', '13000', '48027', '50103', '53003'])
assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
assert set(new_df["incidence"].values) == set(exp_incidence.values)
assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
assert set(new_df["incidence"].values) - set(exp_incidence.values) == set([np.Inf])
assert set(new_df["cumulative_prop"].values) - set(exp_cprop.values) == set([np.Inf])

def test_state(self):

@@ -95,19 +114,31 @@ def test_state(self):
"population": [100, 2100, 300, 25],
}
)

df_mega = pd.DataFrame(
{
"fips": ["90013", "90001", "04000", "25000"],
"timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
"new_counts": [8, 2, 5, 10],
"cumulative_counts": [80, 12, 30, 100],
"population": [np.nan, np.nan, np.nan, np.nan],
}
)

df = df.append(df_mega)

new_df = geo_map(df, "state", MAP_DF)
new_df = geo_map(df, "state", MAP_DF, 'new_counts')

exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
exp_incidence = np.array([27 + 5, 13 + 10]) / np.array([2500, 25]) * 100000
exp_cprop = np.array([165 + 30, 60 + 100]) / np.array([2500, 25]) * 100000

assert (new_df["geo_id"].values == ["az", "ma"]).all()
assert (new_df["timestamp"].values == ["2020-02-15", "2020-02-15"]).all()
assert (new_df["new_counts"].values == [27, 13]).all()
assert (new_df["cumulative_counts"].values == [165, 60]).all()
assert (new_df["population"].values == [2500, 25]).all()
assert (new_df["incidence"].values == exp_incidence).all()
assert (new_df["cumulative_prop"].values == exp_cprop).all()
assert set(new_df["geo_id"].values) == set(["az", "ma", "al", "ga"])
assert set(new_df["timestamp"].values) == set(["2020-02-15"])
assert set(new_df["new_counts"].values) == set([32, 23, 2, 8])
assert set(new_df["cumulative_counts"].values) == set([195, 160, 12, 80])
assert set(new_df["population"].values) == set([2500, 25, 0])
assert set(new_df["incidence"].values) - set(exp_incidence) == set([np.Inf])
assert set(new_df["cumulative_prop"].values) - set(exp_cprop) == set([np.Inf])

def test_hrr(self):

@@ -121,7 +152,19 @@ def test_hrr(self):
}
)

new_df = geo_map(df, "hrr", MAP_DF)
# df_mega = pd.DataFrame(
# {
# "fips": ["90013", "90001"],
# "timestamp": ["2020-02-15", "2020-02-15"],
# "new_counts": [8, 2],
# "cumulative_counts": [80, 12],
# "population": [np.nan, np.nan],
# }
# )

# df = df.append(df_mega)

new_df = geo_map(df, "hrr", MAP_DF, 'new_counts')

exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000
@@ -145,8 +188,20 @@ def test_msa(self):
"population": [100, 2100, 300, 25],
}
)

new_df = geo_map(df, "msa", MAP_DF)

# df_mega = pd.DataFrame(
# {
# "fips": ["90013", "90001"],
# "timestamp": ["2020-02-15", "2020-02-15"],
# "new_counts": [8, 2],
# "cumulative_counts": [80, 12],
# "population": [np.nan, np.nan],
# }
# )

# df = df.append(df_mega)

new_df = geo_map(df, "msa", MAP_DF, 'new_counts')

exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
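The np.Inf assertions in these tests follow from pandas behavior rather than anything JHU-specific: megacounty rows carry NaN population, groupby(...).sum() treats NaN as zero, and dividing a positive count by zero yields inf. A small check of that assumption:

import numpy as np
import pandas as pd

g = pd.DataFrame({
    "geo_id": ["01000", "01000"],
    "new_counts": [2, 3],
    "population": [np.nan, np.nan],
}).groupby("geo_id").sum()

print(g["population"].values)                               # [0.]  (NaN summed as zero)
print((g["new_counts"] / g["population"] * 100000).values)  # [inf]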
2 changes: 1 addition & 1 deletion jhu/tests/test_smooth.py
@@ -14,7 +14,7 @@ def test_output_files_smoothed(self, run_as_module):

smoothed = pd.read_csv(
join("receiving",
f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
f"{dates[-1]}_state_wip_confirmed_7dav_cumul_num.csv")
)

raw = pd.concat([
