From ae52d0d816553ff786e7a951348dc001dabf07cf Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Mon, 29 Aug 2022 17:32:43 -0400
Subject: [PATCH] Drop pre-2010 rows before computing score (#1848)

Note this is probably NOT the optimal place for this change; it might
make more sense for each source to filter its own tracts down to the
acceptable tract list. However, that would be a pretty invasive change,
whereas this one is central, and plenty of other things happening in
score transform could also be moved to sources, so for today, here's
where the change will live.
---
 .../data_pipeline/etl/score/etl_score.py | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index cfcd123dc..f0827522e 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -45,6 +45,7 @@ def __init__(self):
         self.persistent_poverty_df: pd.DataFrame
         self.census_decennial_df: pd.DataFrame
         self.census_2010_df: pd.DataFrame
+        self.national_tract_df: pd.DataFrame
         self.child_opportunity_index_df: pd.DataFrame
         self.hrs_df: pd.DataFrame
         self.dot_travel_disadvantage_df: pd.DataFrame
@@ -216,6 +217,15 @@ def extract(self) -> None:
             low_memory=False,
         )

+        national_tract_csv = constants.DATA_CENSUS_CSV_FILE_PATH
+        self.national_tract_df = pd.read_csv(
+            national_tract_csv,
+            names=[self.GEOID_TRACT_FIELD_NAME],
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
+            low_memory=False,
+            header=None,
+        )
+
     def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
         logger.info("Joining Census Tract dataframes")

@@ -384,6 +394,21 @@ def _prepare_initial_df(self) -> pd.DataFrame:

         census_tract_df = self._join_tract_dfs(census_tract_dfs)

+        # Drop tracts that don't exist in the 2010 tracts
+        tracts_to_drop_count = len(
+            set(census_tract_df.GEOID10_TRACT)
+            - set(self.national_tract_df.GEOID10_TRACT)
+        )
+        logger.info(
+            "Dropping %s tracts not in the 2010 tract data",
+            tracts_to_drop_count,
+        )
+        census_tract_df = census_tract_df.loc[
+            census_tract_df.GEOID10_TRACT.isin(
+                self.national_tract_df.GEOID10_TRACT
+            )
+        ]
+
         # If GEOID10s are read as numbers instead of strings, the initial 0 is dropped,
         # and then we get too many CBG rows (one for 012345 and one for 12345).
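
For reference, below is a minimal, self-contained sketch of the filtering step the
last hunk applies, using the same isin-based approach. The in-memory CSV, the sample
tract IDs, and the module-level logger are stand-ins for illustration only; the real
pipeline reads the tract list from constants.DATA_CENSUS_CSV_FILE_PATH inside the ETL
class's extract() method and uses self.GEOID_TRACT_FIELD_NAME for the column name.

    import io
    import logging

    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

    # Stand-in for the national 2010 tract list: a headerless, single-column CSV,
    # read with an explicit column name and a string dtype so leading zeros survive.
    national_tract_csv = io.StringIO("01001020100\n01001020200\n")
    national_tract_df = pd.read_csv(
        national_tract_csv,
        names=[GEOID_TRACT_FIELD_NAME],
        dtype={GEOID_TRACT_FIELD_NAME: "string"},
        header=None,
    )

    # Stand-in for the joined census tract data; the second row is not a 2010 tract.
    census_tract_df = pd.DataFrame(
        {
            GEOID_TRACT_FIELD_NAME: pd.array(
                ["01001020100", "99999999999"], dtype="string"
            )
        }
    )

    # Count the tracts missing from the 2010 list, log the count, then keep only
    # the rows whose GEOID appears in the reference list.
    tracts_to_drop_count = len(
        set(census_tract_df[GEOID_TRACT_FIELD_NAME])
        - set(national_tract_df[GEOID_TRACT_FIELD_NAME])
    )
    logger.info("Dropping %s tracts not in the 2010 tract data", tracts_to_drop_count)
    census_tract_df = census_tract_df.loc[
        census_tract_df[GEOID_TRACT_FIELD_NAME].isin(
            national_tract_df[GEOID_TRACT_FIELD_NAME]
        )
    ]

After the filter, census_tract_df keeps only the row whose GEOID appears in the
2010 reference list; the other row is dropped and the logged count is 1.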
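
The commit message mentions the alternative of having each source filter its own
tracts down to the acceptable list. A hypothetical sketch of that approach is below;
the mixin name, method name, and wiring are invented for illustration and do not
exist in the data_pipeline codebase.

    import pandas as pd


    class Filter2010TractsMixin:
        """Hypothetical helper a source ETL class could mix in to drop non-2010
        tracts during its own transform step (names are illustrative only)."""

        GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

        def filter_to_2010_tracts(
            self, df: pd.DataFrame, national_tract_df: pd.DataFrame
        ) -> pd.DataFrame:
            # Keep only rows whose tract GEOID appears in the 2010 reference list.
            return df.loc[
                df[self.GEOID_TRACT_FIELD_NAME].isin(
                    national_tract_df[self.GEOID_TRACT_FIELD_NAME]
                )
            ]

Doing it per source would keep bad tracts out of every intermediate dataframe, at
the cost of touching each source ETL, which is the invasive change the commit
message chose to defer.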