From 47b7ea23b9882f620c391474d75061bda0adb08a Mon Sep 17 00:00:00 2001 From: matt bowen Date: Fri, 23 Sep 2022 17:38:13 -0400 Subject: [PATCH 01/11] Backfill population in island areas (#1882) --- .../data_pipeline/etl/score/etl_score.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index b0fc3b4d8..7ee7b18a1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -634,8 +634,24 @@ class ReversePercentile: ] ].mean(axis=1, skipna=True) + # For AS, MP, GU, and VI, backfill data from the 2010 census where we have it + df_copy = self._backfill_island_data(df_copy) + return df_copy + @staticmethod + def _backfill_island_data(df: pd.DataFrame) -> pd.DataFrame: + logger.info("Backfilling island data") + island_index = ( + df[field_names.GEOID_TRACT_FIELD] + .str[:2] + .isin(constants.TILES_ISLAND_AREA_FIPS_CODES) + ) + df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[ + island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010 + ] + return df + def transform(self) -> None: logger.info("Transforming Score Data") From 97179ec5eae1641f35b7e10dd19c669accd8de7d Mon Sep 17 00:00:00 2001 From: matt bowen Date: Mon, 26 Sep 2022 10:49:59 -0400 Subject: [PATCH 02/11] Update smoketest to account for backfills (#1882) As I wrote in the comment: We backfill island areas with data from the 2010 census, so if THOSE tracts have data beyond the data source, that's to be expected and is fine to pass. If some other state or territory does though, this should fail. This ends up being a nice way of documenting that behavior I guess! 
--- .../data_pipeline/tests/score/test_output.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index f10e6f71c..d6a5cb1a0 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -7,6 +7,7 @@ import numpy as np from data_pipeline.score import field_names from data_pipeline.score.field_names import GEOID_TRACT_FIELD +from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES from .fixtures import ( final_score_df, ejscreen_df, @@ -287,7 +288,24 @@ def test_data_sources( # Make sure we have NAs for any tracts in the final data that aren't # included in the data source - assert np.all(df[df.MERGE == "left_only"][final_columns].isna()) + has_additional_non_null_tracts = not np.all( + df[df.MERGE == "left_only"][final_columns].isna() + ) + if has_additional_non_null_tracts: + # We backfill island areas with data from the 2010 census, so if THOSE tracts + # have data beyond the data source, that's to be expected and is fine to pass. 
+ # If some other state or territory does though, this should fail + left_only = df.loc[(df.MERGE == "left_only")] + left_only_has_value = left_only.loc[ + ~df[final_columns].isna().all(axis=1) + ] + fips_with_values = set( + left_only_has_value[field_names.GEOID_TRACT_FIELD].str[0:2] + ) + non_island_fips_codes = fips_with_values.difference( + TILES_ISLAND_AREA_FIPS_CODES + ) + assert not non_island_fips_codes # Make sure the datasource doesn't have a ton of unmatched tracts, implying it # has moved to 2020 tracts From 136c86693e3b20abc54306e3dc6f7abf5ba550f6 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Mon, 26 Sep 2022 10:55:02 -0400 Subject: [PATCH 03/11] Fixup lint issues (#1882) --- data/data-pipeline/data_pipeline/etl/score/constants.py | 4 +--- .../data_pipeline/tests/sources/eamlis/test_etl.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 46adab52f..410d194c4 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -381,8 +381,6 @@ field_names.PERCENT_AGE_OVER_64: "AGE_OLD", field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT: "TA_COUNT", field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC", - - } # columns to round floats to 2 decimals @@ -456,5 +454,5 @@ field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME, field_names.AML_BOOLEAN, field_names.HISTORIC_REDLINING_SCORE_EXCEEDED, - field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT + field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT, ] diff --git a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py index 37b15f650..09275de2e 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/eamlis/test_etl.py @@ -156,4 +156,4 @@ def test_tract_id_lengths(self, mock_etl, 
mock_paths): "data_pipeline.etl.sources.eamlis.etl.add_tracts_for_geometries", new=_fake_add_tracts_for_geometries, ): - super().test_tract_id_lengths(mock_etl, mock_paths) \ No newline at end of file + super().test_tract_id_lengths(mock_etl, mock_paths) From 2ea14ba4280dd6a494c2aa4a2046ce8c843c76e2 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Tue, 27 Sep 2022 09:13:15 -0400 Subject: [PATCH 04/11] Add in race demos to 2010 census pull (#1851) --- .../etl/sources/census_decennial/etl.py | 126 ++++++++++++++++-- .../data_pipeline/score/field_names.py | 1 + 2 files changed, 119 insertions(+), 8 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index ea503f626..0ad37cab0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -1,4 +1,5 @@ import json +from typing import List import requests import numpy as np @@ -147,6 +148,65 @@ def __init__(self): field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 ) + # Race/Ethnicity fields + self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total + self.ASIAN_FIELD = "PCT086002" # Total!!Asian + self.BLACK_OR_AA_FIELD = "PCT086003" # Total!!Black or African American + self.NATIVE_HI_OR_API_FIELD = ( + "PCT086004" # Total!!Native Hawaiian and Other Pacific Islander + ) + self.WHITE_FIELD = "PCT086005" # Total!!White + self.HISPANIC_OR_LATINO_FIELD = "PCT086006" # Total!!Hispanic or Latino + self.TWO_OR_MORE_RACES_FIELD = ( + "P004024" # Total!!Two or More Ethnic Origins or RaceTotal + ) + self.OTHER_ETHNIC_ORIGIN_FIELD = ( + "PCT086007" # Total!!Other Ethnic Origin or Ra + ) + + self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total + self.BLACK_VI_FIELD = ( + "P003003" # Total!!One race!!Black or African American alone + ) + self.AMERICAN_INDIAN_VI_FIELD = "P003005" # Total!!One race!!American Indian and Alaska 
Native alone + self.ASIAN_VI_FIELD = "P003006" # Total!!One race!!Asian alone + self.HAWAIIAN_VI_FIELD = "P003007" # Total!!One race!!Native Hawaiian and Other Pacific Islander alone + self.TWO_OR_MORE_RACES_VI_FIELD = "P003009" # Total!!Two or More Races + self.NON_HISPANIC_WHITE_VI_FIELD = ( + "P005006" # Total!!Not Hispanic or Latino!!One race!!White alone + ) + self.HISPANIC_VI_FIELD = "P005002" # Total!!Hispanic or Latino + self.OTHER_RACE_VI_FIELD = ( + "P003008" # Total!!One race!!Some Other Race alone + ) + self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total + + self.TOTAL_RACE_POPULATION_FIELD_NAME = ( + "Total population surveyed on racial data" + ) + self.BLACK_FIELD_NAME = "Black or African American" + self.AMERICAN_INDIAN_FIELD_NAME = "American Indian / Alaska Native" + self.ASIAN_FIELD_NAME = "Asian" + self.HAWAIIAN_FIELD_NAME = "Native Hawaiian or Pacific" + self.TWO_OR_MORE_RACES_FIELD_NAME = "two or more races" + self.NON_HISPANIC_WHITE_FIELD_NAME = "White" + self.HISPANIC_FIELD_NAME = "Hispanic or Latino" + # Note that `other` is lowercase because the whole field will show up in the download + # file as "Percent other races" + self.OTHER_RACE_FIELD_NAME = "other races" + + # Name output demographics fields. 
+ self.RE_OUTPUT_FIELDS = [ + self.BLACK_FIELD_NAME, + self.AMERICAN_INDIAN_FIELD_NAME, + self.ASIAN_FIELD_NAME, + self.HAWAIIAN_FIELD_NAME, + self.TWO_OR_MORE_RACES_FIELD_NAME, + self.NON_HISPANIC_WHITE_FIELD_NAME, + self.HISPANIC_FIELD_NAME, + self.OTHER_RACE_FIELD_NAME, + ] + var_list = [ self.MEDIAN_INCOME_FIELD, self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD, @@ -162,6 +222,14 @@ def __init__(self): self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD, self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD, self.TOTAL_POP_FIELD, + self.TOTAL_RACE_POPULATION_FIELD, + self.ASIAN_FIELD, + self.TWO_OR_MORE_RACES_FIELD, + self.BLACK_OR_AA_FIELD, + self.NATIVE_HI_OR_API_FIELD, + self.WHITE_FIELD, + self.HISPANIC_OR_LATINO_FIELD, + self.OTHER_ETHNIC_ORIGIN_FIELD, ] var_list = ",".join(var_list) @@ -180,6 +248,15 @@ def __init__(self): self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD, self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD, self.TOTAL_POP_VI_FIELD, + self.BLACK_VI_FIELD, + self.AMERICAN_INDIAN_VI_FIELD, + self.ASIAN_VI_FIELD, + self.HAWAIIAN_VI_FIELD, + self.TWO_OR_MORE_RACES_VI_FIELD, + self.NON_HISPANIC_WHITE_VI_FIELD, + self.HISPANIC_VI_FIELD, + self.OTHER_RACE_VI_FIELD, + self.TOTAL_RACE_POPULATION_VI_FIELD, ] var_list_vi = ",".join(var_list_vi) @@ -210,6 +287,23 @@ def __init__(self): self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD, self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD, self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD, + self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME, + self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME, + self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME, + self.ASIAN_FIELD: self.ASIAN_FIELD_NAME, + self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME, + self.BLACK_OR_AA_FIELD: self.BLACK_FIELD_NAME, + self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME, + self.NATIVE_HI_OR_API_FIELD: 
self.HAWAIIAN_FIELD_NAME, + self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME, + self.TWO_OR_MORE_RACES_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME, + self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME, + self.WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, + self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, + self.HISPANIC_OR_LATINO_FIELD: self.HISPANIC_FIELD_NAME, + self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME, + self.OTHER_ETHNIC_ORIGIN_FIELD: self.OTHER_RACE_FIELD_NAME, + self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME, } # To do: Ask Census Slack Group about whether you need to hardcode the county fips @@ -252,6 +346,8 @@ def __init__(self): + "&for=tract:*&in=state:{}%20county:{}" ) + self.final_race_fields: List[str] = [] + self.df: pd.DataFrame self.df_vi: pd.DataFrame self.df_all: pd.DataFrame @@ -264,14 +360,15 @@ def extract(self) -> None: f"Downloading data for state/territory {island['state_abbreviation']}" ) for county in island["county_fips"]: + api_url = self.API_URL.format( + self.DECENNIAL_YEAR, + island["state_abbreviation"], + island["var_list"], + island["fips"], + county, + ) download = requests.get( - self.API_URL.format( - self.DECENNIAL_YEAR, - island["state_abbreviation"], - island["var_list"], - island["fips"], - county, - ), + api_url, timeout=settings.REQUESTS_DEFAULT_TIMOUT, ) @@ -379,6 +476,19 @@ def transform(self) -> None: self.df_all["state"] + self.df_all["county"] + self.df_all["tract"] ) + # Calculate stats by race + for race_field_name in self.RE_OUTPUT_FIELDS: + output_field_name = ( + field_names.PERCENT_PREFIX + + race_field_name + + field_names.ISLAND_AREA_BACKFILL_SUFFIX + ) + self.final_race_fields.append(output_field_name) + self.df_all[output_field_name] = ( + self.df_all[race_field_name] + / self.df_all[self.TOTAL_RACE_POPULATION_FIELD_NAME] + ) + # Reporting Missing Values for col in self.df_all.columns: missing_value_count = self.df_all[col].isnull().sum() @@ -402,7 +512,7 @@ 
def load(self) -> None: self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME, self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME, self.UNEMPLOYMENT_FIELD_NAME, - ] + ] + self.final_race_fields self.df_all[columns_to_include].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 570dae884..de744adec 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -3,6 +3,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas" ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)" ADJACENCY_INDEX_SUFFIX = " (average of neighbors)" +ISLAND_AREA_BACKFILL_SUFFIX = " (2010 census data backfill)" # Geographic field names GEOID_TRACT_FIELD = "GEOID10_TRACT" From a9459a61a46f7e95abe1d9cc0636bfc98aab8e21 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Tue, 27 Sep 2022 10:15:22 -0400 Subject: [PATCH 05/11] Add backfill data to score (#1851) --- .../data_pipeline/etl/score/etl_score.py | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 7ee7b18a1..55d0fa645 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -1,4 +1,6 @@ import functools +from typing import List + from dataclasses import dataclass import numpy as np @@ -56,6 +58,8 @@ def __init__(self): self.fuds_df: pd.DataFrame self.tribal_overlap_df: pd.DataFrame + self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = [] + def extract(self) -> None: logger.info("Loading data sets from disk.") @@ -402,6 +406,25 @@ def _prepare_initial_df(self) -> pd.DataFrame: df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD] ) + 
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [ + field_names.PERCENT_BLACK_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_ASIAN_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_HAWAIIAN_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_HISPANIC_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_OTHER_RACE_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + ] + # Donut columns get added later numeric_columns = [ field_names.HOUSING_BURDEN_FIELD, @@ -471,7 +494,7 @@ def _prepare_initial_df(self) -> pd.DataFrame: field_names.PERCENT_AGE_OVER_64, field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT, field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT, - ] + ] + self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS non_numeric_columns = [ self.GEOID_TRACT_FIELD_NAME, @@ -639,14 +662,23 @@ class ReversePercentile: return df_copy - @staticmethod - def _backfill_island_data(df: pd.DataFrame) -> pd.DataFrame: + def _backfill_island_data(self, df: pd.DataFrame) -> pd.DataFrame: logger.info("Backfilling island data") island_index = ( df[field_names.GEOID_TRACT_FIELD] .str[:2] .isin(constants.TILES_ISLAND_AREA_FIPS_CODES) ) + + for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: + actual_field_name = backfill_field_name.replace( + field_names.ISLAND_AREA_BACKFILL_SUFFIX, "" + ) + df.loc[island_index, actual_field_name] = df.loc[ + island_index, backfill_field_name + ] + df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS) + df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[ island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010 ] From 
a9e5d6ef5c84ea3fea418cf557da5f5196786c8c Mon Sep 17 00:00:00 2001 From: matt bowen Date: Tue, 27 Sep 2022 12:46:23 -0400 Subject: [PATCH 06/11] Change column name (#1851) --- data/data-pipeline/data_pipeline/etl/score/etl_score.py | 2 +- .../data_pipeline/etl/sources/census_decennial/etl.py | 1 - data/data-pipeline/data_pipeline/score/field_names.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 55d0fa645..3275092dc 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -658,7 +658,7 @@ class ReversePercentile: ].mean(axis=1, skipna=True) # For AS, MP, GU, and VI, backfill data from the 2010 census where we have it - df_copy = self._backfill_island_data(df_copy) + # df_copy = self._backfill_island_data(df_copy) return df_copy diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 0ad37cab0..2a6ae510c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -481,7 +481,6 @@ def transform(self) -> None: output_field_name = ( field_names.PERCENT_PREFIX + race_field_name - + field_names.ISLAND_AREA_BACKFILL_SUFFIX ) self.final_race_fields.append(output_field_name) self.df_all[output_field_name] = ( diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index de744adec..35a3514e2 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -3,7 +3,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas" ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)" ADJACENCY_INDEX_SUFFIX = " 
(average of neighbors)" -ISLAND_AREA_BACKFILL_SUFFIX = " (2010 census data backfill)" +ISLAND_AREA_BACKFILL_SUFFIX = "in 2009" # Geographic field names GEOID_TRACT_FIELD = "GEOID10_TRACT" From 6c049ed6c1ace4072a664c20296d4f2a6578f536 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Tue, 27 Sep 2022 16:13:24 -0400 Subject: [PATCH 07/11] Fill demos after the score (#1851) --- .../data_pipeline/etl/score/etl_score.py | 18 +++++++++--------- .../etl/sources/census_decennial/etl.py | 1 + .../data_pipeline/score/field_names.py | 2 +- .../data_pipeline/score/score_narwhal.py | 1 - 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 3275092dc..3bc08910d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -657,19 +657,19 @@ class ReversePercentile: ] ].mean(axis=1, skipna=True) - # For AS, MP, GU, and VI, backfill data from the 2010 census where we have it - # df_copy = self._backfill_island_data(df_copy) - return df_copy - def _backfill_island_data(self, df: pd.DataFrame) -> pd.DataFrame: - logger.info("Backfilling island data") - island_index = ( + @staticmethod + def _get_island_areas(df: pd.DataFrame) -> pd.Series: + return ( df[field_names.GEOID_TRACT_FIELD] .str[:2] .isin(constants.TILES_ISLAND_AREA_FIPS_CODES) ) + def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Backfilling island demographic data") + island_index = self._get_island_areas(df) for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: actual_field_name = backfill_field_name.replace( field_names.ISLAND_AREA_BACKFILL_SUFFIX, "" @@ -679,9 +679,6 @@ def _backfill_island_data(self, df: pd.DataFrame) -> pd.DataFrame: ] df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS) - df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[ - 
island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010 - ] return df def transform(self) -> None: @@ -693,6 +690,9 @@ def transform(self) -> None: # calculate scores self.df = ScoreRunner(df=self.df).calculate_scores() + # We add island demographic data since it doesn't matter to the score anyway + self.df = self._backfill_island_demographics(self.df) + def load(self) -> None: logger.info("Saving Score CSV") constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 2a6ae510c..0ad37cab0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -481,6 +481,7 @@ def transform(self) -> None: output_field_name = ( field_names.PERCENT_PREFIX + race_field_name + + field_names.ISLAND_AREA_BACKFILL_SUFFIX ) self.final_race_fields.append(output_field_name) self.df_all[output_field_name] = ( diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 35a3514e2..80115a418 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -3,7 +3,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas" ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)" ADJACENCY_INDEX_SUFFIX = " (average of neighbors)" -ISLAND_AREA_BACKFILL_SUFFIX = "in 2009" +ISLAND_AREA_BACKFILL_SUFFIX = " in 2009" # Geographic field names GEOID_TRACT_FIELD = "GEOID10_TRACT" diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index fd7129ffc..7e92654be 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -999,7 +999,6 
@@ def _mark_donut_hole_tracts(self) -> pd.DataFrame: def add_columns(self) -> pd.DataFrame: logger.info("Adding Score Narhwal") - self.df[field_names.THRESHOLD_COUNT] = 0 self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = ( From f11252ef2ecf292ae7edbe7c5b4f6b9cb593d5fe Mon Sep 17 00:00:00 2001 From: matt bowen Date: Wed, 28 Sep 2022 10:24:25 -0400 Subject: [PATCH 08/11] Add income back, adjust test (#1882) --- .../data_pipeline/etl/score/etl_score.py | 4 ++++ .../data_pipeline/etl/sources/census_decennial/etl.py | 1 + .../data_pipeline/tests/score/test_output.py | 11 ++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 3bc08910d..5a865d5f7 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -679,6 +679,10 @@ def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame: ] df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS) + df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[ + island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010 + ] + return df def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 0ad37cab0..e81aebe0a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -367,6 +367,7 @@ def extract(self) -> None: island["fips"], county, ) + logger.debug(f"CENSUS: Requesting {api_url}") download = requests.get( api_url, timeout=settings.REQUESTS_DEFAULT_TIMOUT, diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index d6a5cb1a0..f53101946 100644 --- 
a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -5,6 +5,7 @@ import pytest import pandas as pd import numpy as np +from data_pipeline.etl.score import constants from data_pipeline.score import field_names from data_pipeline.score.field_names import GEOID_TRACT_FIELD from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES @@ -383,8 +384,16 @@ def test_imputed_tracts(final_score_df): ) # Make sure that no tracts with population have null imputed income + # We DO NOT impute income + is_island_area = ( + final_score_df[field_names.GEOID_TRACT_FIELD] + .str[:2] + .isin(constants.TILES_ISLAND_AREA_FIPS_CODES) + ) + tracts_with_some_population_df = final_score_df[ - final_score_df[field_names.TOTAL_POP_FIELD] > 0 + (final_score_df[field_names.TOTAL_POP_FIELD] > 0) + & ~is_island_area ] assert ( not tracts_with_some_population_df[ From b741001be6e1918273ddbc77f4431d507011d5df Mon Sep 17 00:00:00 2001 From: matt bowen Date: Wed, 28 Sep 2022 15:32:12 -0400 Subject: [PATCH 09/11] Apply code-review feedback (#1851) --- .../etl/sources/census_decennial/etl.py | 39 +++++++++---------- .../data_pipeline/tests/score/test_output.py | 2 +- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index e81aebe0a..c23f56ed9 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -151,16 +151,16 @@ def __init__(self): # Race/Ethnicity fields self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total self.ASIAN_FIELD = "PCT086002" # Total!!Asian - self.BLACK_OR_AA_FIELD = "PCT086003" # Total!!Black or African American - self.NATIVE_HI_OR_API_FIELD = ( + self.BLACK_FIELD = "PCT086003" # Total!!Black or African American + self.HAWAIIAN_FIELD_NAME 
= ( "PCT086004" # Total!!Native Hawaiian and Other Pacific Islander ) - self.WHITE_FIELD = "PCT086005" # Total!!White - self.HISPANIC_OR_LATINO_FIELD = "PCT086006" # Total!!Hispanic or Latino - self.TWO_OR_MORE_RACES_FIELD = ( - "P004024" # Total!!Two or More Ethnic Origins or RaceTotal - ) - self.OTHER_ETHNIC_ORIGIN_FIELD = ( + # Note that the 2010 census for island araeas does not break out + # hispanic and non-hispanic white, so this is slightly different from + # our other demographic data + self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White + self.HISPANIC_FIELD_NAME = "PCT086006" # Total!!Hispanic or Latino + self.OTHER_RACE_FIELD = ( "PCT086007" # Total!!Other Ethnic Origin or Ra ) @@ -224,12 +224,11 @@ def __init__(self): self.TOTAL_POP_FIELD, self.TOTAL_RACE_POPULATION_FIELD, self.ASIAN_FIELD, - self.TWO_OR_MORE_RACES_FIELD, - self.BLACK_OR_AA_FIELD, - self.NATIVE_HI_OR_API_FIELD, - self.WHITE_FIELD, - self.HISPANIC_OR_LATINO_FIELD, - self.OTHER_ETHNIC_ORIGIN_FIELD, + self.BLACK_FIELD, + self.HAWAIIAN_FIELD_NAME, + self.NON_HISPANIC_WHITE_FIELD, + self.HISPANIC_FIELD_NAME, + self.OTHER_RACE_FIELD, ] var_list = ",".join(var_list) @@ -289,20 +288,20 @@ def __init__(self): self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD, self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME, self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME, + # Note there is no American Indian data for AS/GU/MI self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME, self.ASIAN_FIELD: self.ASIAN_FIELD_NAME, self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME, - self.BLACK_OR_AA_FIELD: self.BLACK_FIELD_NAME, + self.BLACK_FIELD: self.BLACK_FIELD_NAME, self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME, - self.NATIVE_HI_OR_API_FIELD: self.HAWAIIAN_FIELD_NAME, + self.HAWAIIAN_FIELD_NAME: self.HAWAIIAN_FIELD_NAME, self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME, - self.TWO_OR_MORE_RACES_FIELD: 
self.TWO_OR_MORE_RACES_FIELD_NAME, self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME, - self.WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, + self.NON_HISPANIC_WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, - self.HISPANIC_OR_LATINO_FIELD: self.HISPANIC_FIELD_NAME, + self.HISPANIC_FIELD_NAME: self.HISPANIC_FIELD_NAME, self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME, - self.OTHER_ETHNIC_ORIGIN_FIELD: self.OTHER_RACE_FIELD_NAME, + self.OTHER_RACE_FIELD: self.OTHER_RACE_FIELD_NAME, self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME, } diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index f53101946..a2d10e467 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -384,7 +384,7 @@ def test_imputed_tracts(final_score_df): ) # Make sure that no tracts with population have null imputed income - # We DO NOT impute income + # We DO NOT impute income for island areas, so remove those from the test is_island_area = ( final_score_df[field_names.GEOID_TRACT_FIELD] .str[:2] From 403c55a0ec8cc423963506c881a63a806379038a Mon Sep 17 00:00:00 2001 From: matt bowen Date: Wed, 28 Sep 2022 16:42:04 -0400 Subject: [PATCH 10/11] Add test for island area backfill (#1851) --- .../data_pipeline/tests/score/test_output.py | 76 ++++++++++++++++++- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index a2d10e467..75fb144a8 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -268,7 +268,7 @@ def test_data_sources( # is the "equal" to the data from the ETL, allowing for the minor # differences that come from 
floating point comparisons for data_source_name, data_source in data_sources.items(): - final = "final_" + final = "_final" df: pd.DataFrame = final_score_df.merge( data_source, on=GEOID_TRACT_FIELD, @@ -342,6 +342,77 @@ def test_data_sources( ), error_message +def test_island_demographic_backfill(final_score_df, census_decennial_df): + # Copied from score_etl because there's no better source of truth for it + ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [ + field_names.PERCENT_BLACK_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_ASIAN_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_HAWAIIAN_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_HISPANIC_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.PERCENT_OTHER_RACE_FIELD_NAME + + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + field_names.TOTAL_POP_FIELD + field_names.ISLAND_AREA_BACKFILL_SUFFIX, + ] + + # rename the columns from the decennial census to be their final score names + decennial_cols = { + col_name: col_name.replace(field_names.ISLAND_AREA_BACKFILL_SUFFIX, "") + for col_name in ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS + } + census_decennial_df: pd.DataFrame = census_decennial_df.rename( + columns=decennial_cols + ) + + # Merge decennial data with the final score + df: pd.DataFrame = final_score_df.merge( + census_decennial_df, + on=GEOID_TRACT_FIELD, + indicator="MERGE", + suffixes=("_final", "_decennial"), + how="outer", + ) + + # Make sure columns from both the decennial census and final score overlap + core_cols = census_decennial_df.columns.intersection( + final_score_df.columns + ).drop(GEOID_TRACT_FIELD) + final_columns = 
[f"{col}_final" for col in core_cols] + assert ( + final_columns + ), "No columns from decennial census show up in final score, extremely weird" + + # Make sure we're only grabbing island tracts for the decennial data + assert ( + sorted( + df[df.MERGE == "both"][field_names.GEOID_TRACT_FIELD] + .str[:2] + .unique() + ) + == constants.TILES_ISLAND_AREA_FIPS_CODES + ), "2010 Decennial census contributed unexpected tracts" + + df = df[df.MERGE == "both"] + + # Make sure for all the backfill tracts, the data made it into the + # final score. This can be simple since it's all perenctages and an int + for col in final_columns: + assert np.allclose( + df[col], + df[col.replace("_final", "_decennial")], + equal_nan=True, + ), f"Data mismatch in decennial census backfill for {col}" + + def test_output_tracts(final_score_df, national_tract_df): df = final_score_df.merge( national_tract_df, @@ -392,8 +463,7 @@ def test_imputed_tracts(final_score_df): ) tracts_with_some_population_df = final_score_df[ - (final_score_df[field_names.TOTAL_POP_FIELD] > 0) - & ~is_island_area + (final_score_df[field_names.TOTAL_POP_FIELD] > 0) & ~is_island_area ] assert ( not tracts_with_some_population_df[ From 95a4d8ea3f7cbf2e4687eeba9fac9f953e0e86e0 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Wed, 28 Sep 2022 19:10:08 -0400 Subject: [PATCH 11/11] Fix bad rename (#1851) --- .../etl/sources/census_decennial/etl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index c23f56ed9..0954f8e83 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -152,14 +152,14 @@ def __init__(self): self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total self.ASIAN_FIELD = "PCT086002" # Total!!Asian self.BLACK_FIELD = "PCT086003" # 
Total!!Black or African American - self.HAWAIIAN_FIELD_NAME = ( + self.HAWAIIAN_FIELD = ( "PCT086004" # Total!!Native Hawaiian and Other Pacific Islander ) # Note that the 2010 census for island araeas does not break out # hispanic and non-hispanic white, so this is slightly different from # our other demographic data self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White - self.HISPANIC_FIELD_NAME = "PCT086006" # Total!!Hispanic or Latino + self.HISPANIC_FIELD = "PCT086006" # Total!!Hispanic or Latino self.OTHER_RACE_FIELD = ( "PCT086007" # Total!!Other Ethnic Origin or Ra ) @@ -225,9 +225,9 @@ def __init__(self): self.TOTAL_RACE_POPULATION_FIELD, self.ASIAN_FIELD, self.BLACK_FIELD, - self.HAWAIIAN_FIELD_NAME, + self.HAWAIIAN_FIELD, self.NON_HISPANIC_WHITE_FIELD, - self.HISPANIC_FIELD_NAME, + self.HISPANIC_FIELD, self.OTHER_RACE_FIELD, ] var_list = ",".join(var_list) @@ -294,12 +294,12 @@ def __init__(self): self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME, self.BLACK_FIELD: self.BLACK_FIELD_NAME, self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME, - self.HAWAIIAN_FIELD_NAME: self.HAWAIIAN_FIELD_NAME, + self.HAWAIIAN_FIELD: self.HAWAIIAN_FIELD_NAME, self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME, self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME, self.NON_HISPANIC_WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME, - self.HISPANIC_FIELD_NAME: self.HISPANIC_FIELD_NAME, + self.HISPANIC_FIELD: self.HISPANIC_FIELD_NAME, self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME, self.OTHER_RACE_FIELD: self.OTHER_RACE_FIELD_NAME, self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME,