Skip to content

Commit

Permalink
Switch from filter to inner join (#1848)
Browse files: browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Aug 30, 2022
1 parent 2180efa commit 1c3389a
Showing 1 changed file with 10 additions and 13 deletions.
23 changes: 10 additions & 13 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,21 +395,18 @@ def _prepare_initial_df(self) -> pd.DataFrame:
census_tract_df = self._join_tract_dfs(census_tract_dfs)

# Drop tracts that don't exist in the 2010 tracts
tracts_to_drop_count = len(
set(census_tract_df.GEOID10_TRACT)
- set(self.national_tract_df.GEOID10_TRACT)
)
logger.info(
"Dropping %s tracts not in the 2010 tract data",
tracts_to_drop_count,
pre_join_len = census_tract_df.shape[0]

census_tract_df = census_tract_df.merge(
self.national_tract_df,
on="GEOID10_TRACT",
how="inner",
)
census_tract_df = census_tract_df.loc[
census_tract_df.GEOID10_TRACT.isin(
self.national_tract_df.GEOID10_TRACT
)
]

# If GEOID10s are read as numbers instead of strings, the initial 0 is dropped,
logger.info(
"Dropped %s tracts not in the 2010 tract data",
pre_join_len - census_tract_df.shape[0],
) # If GEOID10s are read as numbers instead of strings, the initial 0 is dropped,
# and then we get too many CBG rows (one for 012345 and one for 12345).

# Now sanity-check the merged df.
Expand Down

0 comments on commit 1c3389a

Please sign in to comment.