diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 46adab52f..410d194c4 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -381,8 +381,6 @@ field_names.PERCENT_AGE_OVER_64: "AGE_OLD", field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT: "TA_COUNT", field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC", - - } # columns to round floats to 2 decimals @@ -456,5 +454,5 @@ field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME, field_names.AML_BOOLEAN, field_names.HISTORIC_REDLINING_SCORE_EXCEEDED, - field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT + field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT, ] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index b0fc3b4d8..7ee7b18a1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -634,8 +634,24 @@ class ReversePercentile: ] ].mean(axis=1, skipna=True) + # For AS, MP, GU, and VI, backfill data from the 2010 census where we have it + df_copy = self._backfill_island_data(df_copy) + return df_copy + @staticmethod + def _backfill_island_data(df: pd.DataFrame) -> pd.DataFrame: + logger.info("Backfilling island data") + island_index = ( + df[field_names.GEOID_TRACT_FIELD] + .str[:2] + .isin(constants.TILES_ISLAND_AREA_FIPS_CODES) + ) + df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[ + island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010 + ] + return df + def transform(self) -> None: logger.info("Transforming Score Data") diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index f10e6f71c..d6a5cb1a0 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -7,6 +7,7 @@ import numpy as np from data_pipeline.score import field_names from data_pipeline.score.field_names import GEOID_TRACT_FIELD +from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES from .fixtures import ( final_score_df, ejscreen_df, @@ -287,7 +288,24 @@ def test_data_sources( # Make sure we have NAs for any tracts in the final data that aren't # included in the data source - assert np.all(df[df.MERGE == "left_only"][final_columns].isna()) + has_additional_non_null_tracts = not np.all( + df[df.MERGE == "left_only"][final_columns].isna() + ) + if has_additional_non_null_tracts: + # We backfill island areas with data from the 2010 census, so if THOSE tracts + # have data beyond the data source, that's to be expected and is fine to pass. + # If some other state or territory does though, this should fail + left_only = df.loc[(df.MERGE == "left_only")] + left_only_has_value = left_only.loc[ + ~df[final_columns].isna().all(axis=1) + ] + fips_with_values = set( + left_only_has_value[field_names.GEOID_TRACT_FIELD].str[0:2] + ) + non_island_fips_codes = fips_with_values.difference( + TILES_ISLAND_AREA_FIPS_CODES + ) + assert not non_island_fips_codes # Make sure the datasource doesn't have a ton of unmatched tracts, implying it # has moved to 2020 tracts diff --git a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py index 37b15f650..09275de2e 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py @@ -156,4 +156,4 @@ def test_tract_id_lengths(self, mock_etl, mock_paths): "data_pipeline.etl.sources.eamlis.etl.add_tracts_for_geometries", new=_fake_add_tracts_for_geometries, ): - super().test_tract_id_lengths(mock_etl, mock_paths) \ No newline at end of file + super().test_tract_id_lengths(mock_etl, mock_paths)