Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor usafacts to use geo utils #316

Merged
merged 11 commits into from
Oct 30, 2020
120 changes: 25 additions & 95 deletions usafacts/delphi_usafacts/geo.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,10 @@
# -*- coding: utf-8 -*-
import pandas as pd

from delphi_utils import GeoMapper

# Base population used when reporting incidence (cases per 100,000 people).
INCIDENCE_BASE = 100000
# https://code.activestate.com/recipes/577775-state-fips-codes-dict/
# Two-letter state/territory postal abbreviation -> two-digit, zero-padded
# state FIPS code (includes DC and PR).
STATE_TO_FIPS = {
    "WA": "53",
    "DE": "10",
    "DC": "11",
    "WI": "55",
    "WV": "54",
    "HI": "15",
    "FL": "12",
    "WY": "56",
    "PR": "72",
    "NJ": "34",
    "NM": "35",
    "TX": "48",
    "LA": "22",
    "NC": "37",
    "ND": "38",
    "NE": "31",
    "TN": "47",
    "NY": "36",
    "PA": "42",
    "AK": "02",
    "NV": "32",
    "NH": "33",
    "VA": "51",
    "CO": "08",
    "CA": "06",
    "AL": "01",
    "AR": "05",
    "VT": "50",
    "IL": "17",
    "GA": "13",
    "IN": "18",
    "IA": "19",
    "MA": "25",
    "AZ": "04",
    "ID": "16",
    "CT": "09",
    "ME": "23",
    "MD": "24",
    "OK": "40",
    "OH": "39",
    "UT": "49",
    "MO": "29",
    "MN": "27",
    "MI": "26",
    "RI": "44",
    "KS": "20",
    "MT": "30",
    "MS": "28",
    "SC": "45",
    "KY": "21",
    "OR": "41",
    "SD": "46",
}

SECONDARY_FIPS = [
("51620", ["51093", "51175"]),
("51685", ["51153"]),
Expand All @@ -76,32 +23,11 @@
("46102", "46113"),
]

# Inverse of STATE_TO_FIPS: two-digit state FIPS code -> lower-case postal
# abbreviation.
FIPS_TO_STATE = {v: k.lower() for k, v in STATE_TO_FIPS.items()}


def fips_to_state(fips: str) -> str:
    """Map a USAFacts county FIPS code to its state's postal abbreviation.

    The first two digits of a five-digit, zero-padded county FIPS code
    identify the state; they are looked up in FIPS_TO_STATE to obtain the
    two-letter postal abbreviation.

    Parameters
    ----------
    fips: str
        Five digit, zero padded county FIPS code

    Returns
    -------
    str
        Two-letter postal abbreviation, lower case.

    Raises
    ------
    KeyError
        Input FIPS code not recognized.
    """
    state_code = fips[:2]
    return FIPS_TO_STATE[state_code]
# Valid geographical resolutions output by this indicator.
VALID_GEO_RES = ("county", "state", "msa", "hrr")
# Sensors that report proportions. For geo resolutions with unallocated cases
# or deaths, we avoid reporting these sensors.
PROP_SENSORS = ("incidence", "cumulative_prop")


def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
Expand All @@ -121,9 +47,9 @@ def disburse(df: pd.DataFrame, pooled_fips: str, fips_list: list):
pd.DataFrame
Dataframe with same schema as df, with the counts disbursed.
"""
COLS = ["new_counts", "cumulative_counts"]
cols = ["new_counts", "cumulative_counts"]
df = df.copy().sort_values(["fips", "timestamp"])
for col in COLS:
for col in cols:
# Get values from the aggregated county:
vals = df.loc[df["fips"] == pooled_fips, col].values / len(fips_list)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small nitpick, but would be good to standardize single vs double quotes within the file

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

for fips in fips_list:
Expand All @@ -142,7 +68,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
Columns: fips, timestamp, new_counts, cumulative_counts, population ...
geo_res: str
Geographic resolution to which to aggregate. Valid options:
('county', 'state', 'msa', 'hrr').
("county", "state", "msa", "hrr").
map_df: pd.DataFrame
Loaded from static file "fips_prop_pop.csv".
sensor: str
Expand All @@ -155,29 +81,31 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
pd.DataFrame
Columns: geo_id, timestamp, ...
"""
VALID_GEO_RES = ("county", "state", "msa", "hrr")
#It is not clear how to calculate the proportion for unallocated cases/deaths
PROP_SENSORS = ("incidence", "cumulative_prop")
if geo_res not in VALID_GEO_RES:
raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

df_mega = df[df['fips'].astype(int) % 1000 == 0].copy()
# State-level records unassigned to specific counties are coded as fake
# counties with fips XX000.
unassigned_counties = df[df["fips"].str.endswith("000")].copy()

df = df[df['fips'].astype(int) % 1000 != 0].copy()
df = df[df["fips"].astype(int) % 1000 != 0].copy()
# Disburse unallocated cases/deaths in NYC to NYC counties
df = disburse(df, NYC_FIPS[0][0], NYC_FIPS[0][1])
df = df[df['fips'] != NYC_FIPS[0][0]]
df = df[df["fips"] != NYC_FIPS[0][0]]

if geo_res == "county":
if sensor not in PROP_SENSORS:
df = df.append(df_mega)
# It is not clear how to calculate the proportion for unallocated
# cases/deaths, so we exclude them for those sensors.
df = df.append(unassigned_counties)
df["geo_id"] = df["fips"]
elif geo_res == "state":
# Grab first two digits of fips
# Map state fips to us postal code
# Add unallocated cases/deaths
df = df.append(df_mega)
df["geo_id"] = df["fips"].apply(fips_to_state)
df = df.append(unassigned_counties)
geo_mapper = GeoMapper()
df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id")
elif geo_res in ("msa", "hrr"):
# Map "missing" secondary FIPS to those that are in our canonical set
for fips, fips_list in SECONDARY_FIPS:
Expand All @@ -189,12 +117,14 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str):
map_df["geo_id"] = map_df[colname].astype(int)
df["fips"] = df["fips"].astype(int)
merged = pd.merge(df, map_df, on="fips")
merged["cumulative_counts"] = merged["cumulative_counts"] * merged["pop_prop"]
merged["cumulative_counts"] =\
merged["cumulative_counts"] * merged["pop_prop"]
merged["new_counts"] = merged["new_counts"] * merged["pop_prop"]
merged["population"] = merged["population"] * merged["pop_prop"]
df = merged.drop(["zip", "pop_prop", "hrrnum", "cbsa_id"], axis=1)
df = df.drop("fips", axis=1)
df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
df["cumulative_prop"] = df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
df["cumulative_prop"] =\
df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
return df
6 changes: 4 additions & 2 deletions usafacts/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
@pytest.fixture(scope="session")
def run_as_module():
# Clean receiving directory
for fname in listdir("receiving"):
remove(join("receiving", fname))
for fname in listdir("../receiving"):
if fname[0] == ".":
continue
remove(join("../receiving", fname))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be made into a one liner with

[remove(fname) for fname in glob.glob('receiving/*')]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops realized this is part of #314


run_module()
42 changes: 17 additions & 25 deletions usafacts/tests/test_geo.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,22 @@
import pytest

from os.path import join

import pytest

import numpy as np
import pandas as pd
from delphi_usafacts.geo import fips_to_state, disburse, geo_map
from delphi_usafacts.geo import disburse, geo_map

MAP_DF = pd.read_csv(
join("..", "static", "fips_prop_pop.csv"),
dtype={"fips": int}
)

sensor = "new_counts"
class TestFipsToState:
    """Tests for `geo.fips_to_state()`."""

    def test_normal(self):
        """Spot-check county FIPS codes against known state abbreviations."""
        cases = {
            "53003": "wa",
            "48027": "tx",
            "12003": "fl",
            "50103": "vt",
            "15003": "hi",
        }
        for fips, expected in cases.items():
            assert fips_to_state(fips) == expected

SENSOR = "new_counts"

class TestDisburse:
"""Tests for the `geo.disburse()` function."""
def test_even(self):

"""Tests that values are disbursed evenly across recipients."""
df = pd.DataFrame(
{
"fips": ["51093", "51175", "51620"],
Expand All @@ -43,8 +34,9 @@ def test_even(self):


class TestGeoMap:
"""Tests for `geo.geo_map()`."""
def test_incorrect_geo(self):

"""Tests that an invalid resolution raises an error."""
df = pd.DataFrame(
{
"fips": ["53003", "48027", "50103"],
Expand All @@ -56,10 +48,10 @@ def test_incorrect_geo(self):
)

with pytest.raises(ValueError):
geo_map(df, "département", MAP_DF, sensor)
geo_map(df, "département", MAP_DF, SENSOR)

def test_county(self):

"""Tests that values are correctly aggregated at the county level."""
df = pd.DataFrame(
{
"fips": ["53003", "48027", "50103"],
Expand All @@ -70,7 +62,7 @@ def test_county(self):
}
)

new_df = geo_map(df, "county", MAP_DF, sensor)
new_df = geo_map(df, "county", MAP_DF, SENSOR)

exp_incidence = df["new_counts"] / df["population"] * 100000
exp_cprop = df["cumulative_counts"] / df["population"] * 100000
Expand All @@ -81,7 +73,7 @@ def test_county(self):
assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)

def test_state(self):

"""Tests that values are correctly aggregated at the state level."""
df = pd.DataFrame(
{
"fips": ["04001", "04003", "04009", "25023"],
Expand All @@ -92,7 +84,7 @@ def test_state(self):
}
)

new_df = geo_map(df, "state", MAP_DF, sensor)
new_df = geo_map(df, "state", MAP_DF, SENSOR)

exp_incidence = np.array([27, 13]) / np.array([2500, 25]) * 100000
exp_cprop = np.array([165, 60]) / np.array([2500, 25]) * 100000
Expand All @@ -106,7 +98,7 @@ def test_state(self):
assert (new_df["cumulative_prop"].values == exp_cprop).all()

def test_hrr(self):

"""Tests that values are correctly aggregated at the HRR level."""
df = pd.DataFrame(
{
"fips": ["13009", "13017", "13021", "09015"],
Expand All @@ -117,7 +109,7 @@ def test_hrr(self):
}
)

new_df = geo_map(df, "hrr", MAP_DF, sensor)
new_df = geo_map(df, "hrr", MAP_DF, SENSOR)

exp_incidence = np.array([13, 27]) / np.array([25, 2500]) * 100000
exp_cprop = np.array([60, 165]) / np.array([25, 2500]) * 100000
Expand All @@ -131,7 +123,7 @@ def test_hrr(self):
assert new_df["cumulative_prop"].values == pytest.approx(exp_cprop)

def test_msa(self):

"""Tests that values are correctly aggregated at the MSA level."""
df = pd.DataFrame(
{
"fips": ["13009", "13017", "13021", "09015"],
Expand All @@ -142,7 +134,7 @@ def test_msa(self):
}
)

new_df = geo_map(df, "msa", MAP_DF, sensor)
new_df = geo_map(df, "msa", MAP_DF, SENSOR)

exp_incidence = np.array([2, 13]) / np.array([300, 25]) * 100000
exp_cprop = np.array([45, 60]) / np.array([300, 25]) * 100000
Expand Down
4 changes: 2 additions & 2 deletions usafacts/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class TestRun:
def test_output_files_exist(self, run_as_module):

csv_files = listdir("receiving")
csv_files = listdir("../receiving")

dates = [
"20200229",
Expand Down Expand Up @@ -48,6 +48,6 @@ def test_output_files_exist(self, run_as_module):
def test_output_file_format(self, run_as_module):

df = pd.read_csv(
join("receiving", "20200310_state_confirmed_cumulative_num.csv")
join("../receiving", "20200310_state_confirmed_cumulative_num.csv")
)
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
4 changes: 2 additions & 2 deletions usafacts/tests/test_smooth.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ def test_output_files_smoothed(self, run_as_module):
dates = [str(x) for x in range(20200304, 20200311)]

smoothed = pd.read_csv(
join("receiving",
join("../receiving",
f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
)

raw = pd.concat([
pd.read_csv(
join("receiving",
join("../receiving",
f"{date}_state_confirmed_cumulative_num.csv")
) for date in dates
])
Expand Down