Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Backfill population in island areas (#1882) #1923

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions data/data-pipeline/data_pipeline/etl/score/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,6 @@
field_names.PERCENT_AGE_OVER_64: "AGE_OLD",
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT: "TA_COUNT",
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC",


}

# columns to round floats to 2 decimals
Expand Down Expand Up @@ -456,5 +454,5 @@
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
]
16 changes: 16 additions & 0 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,8 +634,24 @@ class ReversePercentile:
]
].mean(axis=1, skipna=True)

# For AS, MP, GU, and VI, backfill data from the 2010 census where we have it
df_copy = self._backfill_island_data(df_copy)

return df_copy

@staticmethod
def _backfill_island_data(df: pd.DataFrame) -> pd.DataFrame:
    """Copy the combined 2010 census population into island-area rows.

    A row is treated as an island area when the first two characters of
    its tract GEOID appear in ``constants.TILES_ISLAND_AREA_FIPS_CODES``.
    For those rows the total population column is overwritten with the
    combined 2010 census total population column; all other rows are left
    untouched.

    Args:
        df: Score dataframe containing the tract GEOID, total population
            and combined 2010 census total population columns.

    Returns:
        The same dataframe object, modified in place.
    """
    logger.info("Backfilling island data")

    # The leading two characters of the tract GEOID are compared against the
    # island-area FIPS codes to build a boolean row mask.
    fips_prefix = df[field_names.GEOID_TRACT_FIELD].str[:2]
    is_island_row = fips_prefix.isin(constants.TILES_ISLAND_AREA_FIPS_CODES)

    # NOTE(review): the overwrite is unconditional for island rows, so a
    # missing 2010 value would replace whatever was there — confirm intended.
    population_2010 = df.loc[
        is_island_row, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010
    ]
    df.loc[is_island_row, field_names.TOTAL_POP_FIELD] = population_2010
    return df

def transform(self) -> None:
logger.info("Transforming Score Data")

Expand Down
20 changes: 19 additions & 1 deletion data/data-pipeline/data_pipeline/tests/score/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
from data_pipeline.score import field_names
from data_pipeline.score.field_names import GEOID_TRACT_FIELD
from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
from .fixtures import (
final_score_df,
ejscreen_df,
Expand Down Expand Up @@ -287,7 +288,24 @@ def test_data_sources(

# Make sure we have NAs for any tracts in the final data that aren't
# included in the data source
assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
has_additional_non_null_tracts = not np.all(
df[df.MERGE == "left_only"][final_columns].isna()
)
if has_additional_non_null_tracts:
# We backfill island areas with data from the 2010 census, so if THOSE tracts
# have data beyond the data source, that's to be expected and is fine to pass.
# If any other state or territory does, though, this should fail.
left_only = df.loc[(df.MERGE == "left_only")]
left_only_has_value = left_only.loc[
~df[final_columns].isna().all(axis=1)
]
fips_with_values = set(
left_only_has_value[field_names.GEOID_TRACT_FIELD].str[0:2]
)
non_island_fips_codes = fips_with_values.difference(
TILES_ISLAND_AREA_FIPS_CODES
)
assert not non_island_fips_codes

# Make sure the datasource doesn't have a ton of unmatched tracts, implying it
# has moved to 2020 tracts
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,4 @@ def test_tract_id_lengths(self, mock_etl, mock_paths):
"data_pipeline.etl.sources.eamlis.etl.add_tracts_for_geometries",
new=_fake_add_tracts_for_geometries,
):
super().test_tract_id_lengths(mock_etl, mock_paths)
super().test_tract_id_lengths(mock_etl, mock_paths)