Switch from filter to inner join (#1848)

usds · Aug 30, 2022 · 1c3389a · 1c3389a
1 parent 2180efa
commit 1c3389a
Showing 1 changed file with 10 additions and 13 deletions.
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -395,21 +395,18 @@ def _prepare_initial_df(self) -> pd.DataFrame:
         census_tract_df = self._join_tract_dfs(census_tract_dfs)
 
         # Drop tracts that don't exist in the 2010 tracts
-        tracts_to_drop_count = len(
-            set(census_tract_df.GEOID10_TRACT)
-            - set(self.national_tract_df.GEOID10_TRACT)
-        )
-        logger.info(
-            "Dropping %s tracts not in the 2010 tract data",
-            tracts_to_drop_count,
+        pre_join_len = census_tract_df.shape[0] 
+
+        census_tract_df = census_tract_df.merge(
+            self.national_tract_df,
+            on="GEOID10_TRACT",
+            how="inner",
         )
-        census_tract_df = census_tract_df.loc[
-            census_tract_df.GEOID10_TRACT.isin(
-                self.national_tract_df.GEOID10_TRACT
-            )
-        ]
 
-        # If GEOID10s are read as numbers instead of strings, the initial 0 is dropped,
+        logger.info(
+            "Dropped %s tracts not in the 2010 tract data",
+            pre_join_len - census_tract_df.shape[0],
+        )        # If GEOID10s are read as numbers instead of strings, the initial 0 is dropped,
         # and then we get too many CBG rows (one for 012345 and one for 12345).
 
         # Now sanity-check the merged df.