Skip to content

Commit

Permalink
Drop pre-2010 rows before computing score (#1848)
Browse files Browse the repository at this point in the history
Note that this is probably NOT the optimal place for this change; it might
make more sense for each source to filter its own tracts down to the
acceptable tract list. However, that would be a pretty invasive change,
whereas this one is central. Plenty of other things happening in the score
transform could also be moved to the sources, so for today, this is where
the change will live.
  • Loading branch information
mattbowen-usds committed Aug 31, 2022
1 parent ac09561 commit 3b6391f
Showing 1 changed file with 25 additions and 0 deletions.
25 changes: 25 additions & 0 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(self):
self.persistent_poverty_df: pd.DataFrame
self.census_decennial_df: pd.DataFrame
self.census_2010_df: pd.DataFrame
self.national_tract_df: pd.DataFrame
self.child_opportunity_index_df: pd.DataFrame
self.hrs_df: pd.DataFrame
self.dot_travel_disadvantage_df: pd.DataFrame
Expand Down Expand Up @@ -216,6 +217,15 @@ def extract(self) -> None:
low_memory=False,
)

national_tract_csv = constants.DATA_CENSUS_CSV_FILE_PATH
self.national_tract_df = pd.read_csv(
national_tract_csv,
names=[self.GEOID_TRACT_FIELD_NAME],
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
header=None,
)

def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Tract dataframes")

Expand Down Expand Up @@ -384,6 +394,21 @@ def _prepare_initial_df(self) -> pd.DataFrame:

census_tract_df = self._join_tract_dfs(census_tract_dfs)

# Drop tracts that don't exist in the 2010 tracts
tracts_to_drop_count = len(
set(census_tract_df.GEOID10_TRACT)
- set(self.national_tract_df.GEOID10_TRACT)
)
logger.info(
"Dropping %s tracts not in the 2010 tract data",
tracts_to_drop_count,
)
census_tract_df = census_tract_df.loc[
census_tract_df.GEOID10_TRACT.isin(
self.national_tract_df.GEOID10_TRACT
)
]

# If GEOID10s are read as numbers instead of strings, the initial 0 is dropped,
# and then we get too many CBG rows (one for 012345 and one for 12345).

Expand Down

0 comments on commit 3b6391f

Please sign in to comment.