From 88b1c792520a3bac4dbf28b10af7e127f26ad661 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Tue, 27 Sep 2022 16:13:24 -0400 Subject: [PATCH] Fill demos after the score (#1851) --- .../data_pipeline/etl/score/etl_score.py | 18 +++++++++--------- .../etl/sources/census_decennial/etl.py | 1 + .../data_pipeline/score/field_names.py | 2 +- .../data_pipeline/score/score_narwhal.py | 1 - 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 3275092dc..3bc08910d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -657,19 +657,19 @@ class ReversePercentile: ] ].mean(axis=1, skipna=True) - # For AS, MP, GU, and VI, backfill data from the 2010 census where we have it - # df_copy = self._backfill_island_data(df_copy) - return df_copy - def _backfill_island_data(self, df: pd.DataFrame) -> pd.DataFrame: - logger.info("Backfilling island data") - island_index = ( + @staticmethod + def _get_island_areas(df: pd.DataFrame) -> pd.Series: + return ( df[field_names.GEOID_TRACT_FIELD] .str[:2] .isin(constants.TILES_ISLAND_AREA_FIPS_CODES) ) + def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Backfilling island demographic data") + island_index = self._get_island_areas(df) for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: actual_field_name = backfill_field_name.replace( field_names.ISLAND_AREA_BACKFILL_SUFFIX, "" @@ -679,9 +679,6 @@ def _backfill_island_data(self, df: pd.DataFrame) -> pd.DataFrame: ] df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS) - df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[ - island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010 - ] return df def transform(self) -> None: @@ -693,6 +690,9 @@ def transform(self) -> None: # calculate scores self.df = ScoreRunner(df=self.df).calculate_scores() + # We add island demographic data since it doesn't matter to the score anyway + self.df = self._backfill_island_demographics(self.df) + def load(self) -> None: logger.info("Saving Score CSV") constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 2a6ae510c..0ad37cab0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -481,6 +481,7 @@ def transform(self) -> None: output_field_name = ( field_names.PERCENT_PREFIX + race_field_name + + field_names.ISLAND_AREA_BACKFILL_SUFFIX ) self.final_race_fields.append(output_field_name) self.df_all[output_field_name] = ( diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 35a3514e2..80115a418 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -3,7 +3,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas" ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)" ADJACENCY_INDEX_SUFFIX = " (average of neighbors)" -ISLAND_AREA_BACKFILL_SUFFIX = "in 2009" +ISLAND_AREA_BACKFILL_SUFFIX = " in 2009" # Geographic field names GEOID_TRACT_FIELD = "GEOID10_TRACT" diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index fd7129ffc..7e92654be 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -999,7 +999,6 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame: def add_columns(self) -> pd.DataFrame: logger.info("Adding Score Narhwal") - self.df[field_names.THRESHOLD_COUNT] = 0 self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = (