
Add demos for island areas #1932

4 changes: 1 addition & 3 deletions data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -381,8 +381,6 @@
field_names.PERCENT_AGE_OVER_64: "AGE_OLD",
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT: "TA_COUNT",
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC",


}

# columns to round floats to 2 decimals
@@ -456,5 +454,5 @@
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
]
54 changes: 53 additions & 1 deletion data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -1,4 +1,6 @@
import functools
from typing import List

from dataclasses import dataclass

import numpy as np
@@ -56,6 +58,8 @@ def __init__(self):
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame

self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []

def extract(self) -> None:
logger.info("Loading data sets from disk.")

@@ -402,6 +406,25 @@ def _prepare_initial_df(self) -> pd.DataFrame:
df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD]
)

self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [
field_names.PERCENT_BLACK_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_ASIAN_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_HAWAIIAN_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_HISPANIC_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_OTHER_RACE_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
]

# Donut columns get added later
numeric_columns = [
field_names.HOUSING_BURDEN_FIELD,
@@ -471,7 +494,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
field_names.PERCENT_AGE_OVER_64,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
]
] + self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS

non_numeric_columns = [
self.GEOID_TRACT_FIELD_NAME,
@@ -636,6 +659,32 @@ class ReversePercentile:

return df_copy

@staticmethod
def _get_island_areas(df: pd.DataFrame) -> pd.Series:
return (
df[field_names.GEOID_TRACT_FIELD]
.str[:2]
.isin(constants.TILES_ISLAND_AREA_FIPS_CODES)
)

def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame:
Contributor: is it worth adding a test for this? It seems like it might be straightforward to test and would add a lot of confidence.

Contributor Author: Good call --- done in 403c55a. It is an ugly test, but it is a test.

Contributor: Got it. I was thinking a test directly of _backfill_island_demographics (just give it a simple input DF and test the output DF), but your test works!
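For reference, a minimal sketch of the kind of direct unit test described in this thread. It assumes the ETL class in etl_score.py is named ScoreETL, that it can be constructed with no arguments, and that "60" (American Samoa) is one of constants.TILES_ISLAND_AREA_FIPS_CODES; the field names are taken from the diff itself.

```python
import pandas as pd

from data_pipeline.etl.score.etl_score import ScoreETL  # assumed class name
from data_pipeline.score import field_names


def test_backfill_island_demographics_copies_2010_values():
    etl = ScoreETL()
    backfill_field = (
        field_names.PERCENT_ASIAN_FIELD_NAME
        + field_names.ISLAND_AREA_BACKFILL_SUFFIX
    )
    # Narrow the backfill list to one field to keep the fixture small.
    etl.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [backfill_field]

    df = pd.DataFrame(
        {
            # "60" is the American Samoa FIPS prefix; "01" (Alabama) is not an island area.
            field_names.GEOID_TRACT_FIELD: ["60010950100", "01073001100"],
            field_names.PERCENT_ASIAN_FIELD_NAME: [None, 0.10],
            backfill_field: [0.25, None],
            field_names.TOTAL_POP_FIELD: [None, 4000],
            field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010: [3500, None],
        }
    )

    result = etl._backfill_island_demographics(df)

    # The island tract picks up the 2010 decennial values...
    assert result.loc[0, field_names.PERCENT_ASIAN_FIELD_NAME] == 0.25
    assert result.loc[0, field_names.TOTAL_POP_FIELD] == 3500
    # ...while the stateside tract is untouched and the helper columns are dropped.
    assert result.loc[1, field_names.PERCENT_ASIAN_FIELD_NAME] == 0.10
    assert backfill_field not in result.columns
```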

logger.info("Backfilling island demographic data")
island_index = self._get_island_areas(df)
for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS:
actual_field_name = backfill_field_name.replace(
field_names.ISLAND_AREA_BACKFILL_SUFFIX, ""
)
df.loc[island_index, actual_field_name] = df.loc[
island_index, backfill_field_name
]
df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS)

df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[
island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010
]

return df

def transform(self) -> None:
logger.info("Transforming Score Data")

@@ -645,6 +694,9 @@ def transform(self) -> None:
# calculate scores
self.df = ScoreRunner(df=self.df).calculate_scores()

# Backfill island demographic data after scoring, since it does not feed into the score itself
self.df = self._backfill_island_demographics(self.df)

def load(self) -> None:
logger.info("Saving Score CSV")
constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
127 changes: 119 additions & 8 deletions data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
@@ -1,4 +1,5 @@
import json
from typing import List
import requests

import numpy as np
@@ -147,6 +148,65 @@ def __init__(self):
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
)

# Race/Ethnicity fields
Contributor Author: @lucasmbrown-usds I'd love your feedback on how I mapped these various variables back to the main results. I tried to follow similar logic to what was in the ACS ETL, and since it's subjective, I'm happy to make any changes.
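To make the mapping pattern concrete, here is a minimal, hypothetical sketch (not the ETL's actual variable lists or rename step): each territory group is downloaded with its own decennial variable codes, and both codes are renamed onto one shared output column before the frames are combined.

```python
import pandas as pd

# Hypothetical excerpt: AS/GU/MP use the PCT086* table while the USVI uses
# P003*/P005*, and both variable codes map onto the same output field name.
ASIAN_FIELD = "PCT086002"     # AS/GU/MP: Total!!Asian
ASIAN_VI_FIELD = "P003006"    # USVI: Total!!One race!!Asian alone
ASIAN_FIELD_NAME = "Asian"

field_name_xwalk = {
    ASIAN_FIELD: ASIAN_FIELD_NAME,
    ASIAN_VI_FIELD: ASIAN_FIELD_NAME,
}

# Renaming each territory frame with the same crosswalk before concatenating
# leaves a single "Asian" column regardless of which census table it came from.
df_island = pd.DataFrame({ASIAN_FIELD: [120, 85]}).rename(columns=field_name_xwalk)
df_vi = pd.DataFrame({ASIAN_VI_FIELD: [40]}).rename(columns=field_name_xwalk)
df_all = pd.concat([df_island, df_vi], ignore_index=True)

print(df_all["Asian"].tolist())  # [120, 85, 40]
```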

self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total
self.ASIAN_FIELD = "PCT086002" # Total!!Asian
self.BLACK_OR_AA_FIELD = "PCT086003" # Total!!Black or African American
self.NATIVE_HI_OR_API_FIELD = (
Contributor: Similar comment on NATIVE_HI_OR_API consistency with other fields.

"PCT086004" # Total!!Native Hawaiian and Other Pacific Islander
)
self.WHITE_FIELD = "PCT086005" # Total!!White
Contributor: Same comment on WHITE vs NON_HISPANIC_WHITE.

Also: White is not the same as Non-Hispanic White. The census asks one question about race (Are you White, Black, [other options]?) and one about ethnicity (Are you Hispanic or non-Hispanic?).

Many people identify as both White and Hispanic.

I'm not sure how we want to handle this -- I don't think the 2010 decennial reports the crosstab of both White and non-Hispanic. So this is probably good enough. But maybe we make a note?
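One possible shape for that note, purely as a sketch (the wording is illustrative and the missing-crosstab point is the reviewer's hedged recollection, not verified here):

```python
# Illustrative only: a comment that could sit next to the WHITE_FIELD definition.
WHITE_FIELD = "PCT086005"  # Total!!White (race alone, NOT non-Hispanic White)
# NOTE: The AS/GU/MP 2010 Island Areas tables may not publish a
# White-by-Hispanic-origin crosstab, while the USVI variable mapped to the same
# output name (P005006) is "Not Hispanic or Latino!!One race!!White alone", so
# the shared output column is only an approximation for AS/GU/MP.
```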

self.HISPANIC_OR_LATINO_FIELD = "PCT086006" # Total!!Hispanic or Latino
self.TWO_OR_MORE_RACES_FIELD = (
"P004024"  # Total!!Two or More Ethnic Origins or Races
)
self.OTHER_ETHNIC_ORIGIN_FIELD = (
"PCT086007"  # Total!!Other Ethnic Origin or Race
)

self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
self.BLACK_VI_FIELD = (
"P003003" # Total!!One race!!Black or African American alone
)
self.AMERICAN_INDIAN_VI_FIELD = "P003005" # Total!!One race!!American Indian and Alaska Native alone
self.ASIAN_VI_FIELD = "P003006" # Total!!One race!!Asian alone
self.HAWAIIAN_VI_FIELD = "P003007" # Total!!One race!!Native Hawaiian and Other Pacific Islander alone
self.TWO_OR_MORE_RACES_VI_FIELD = "P003009" # Total!!Two or More Races
self.NON_HISPANIC_WHITE_VI_FIELD = (
"P005006" # Total!!Not Hispanic or Latino!!One race!!White alone
)
self.HISPANIC_VI_FIELD = "P005002" # Total!!Hispanic or Latino
self.OTHER_RACE_VI_FIELD = (
"P003008" # Total!!One race!!Some Other Race alone
)

self.TOTAL_RACE_POPULATION_FIELD_NAME = (
"Total population surveyed on racial data"
)
self.BLACK_FIELD_NAME = "Black or African American"
self.AMERICAN_INDIAN_FIELD_NAME = "American Indian / Alaska Native"
self.ASIAN_FIELD_NAME = "Asian"
self.HAWAIIAN_FIELD_NAME = "Native Hawaiian or Pacific"
self.TWO_OR_MORE_RACES_FIELD_NAME = "two or more races"
self.NON_HISPANIC_WHITE_FIELD_NAME = "White"
self.HISPANIC_FIELD_NAME = "Hispanic or Latino"
# Note that `other` is lowercase because the whole field will show up in the download
# file as "Percent other races"
self.OTHER_RACE_FIELD_NAME = "other races"

# Name output demographics fields.
self.RE_OUTPUT_FIELDS = [
self.BLACK_FIELD_NAME,
self.AMERICAN_INDIAN_FIELD_NAME,
self.ASIAN_FIELD_NAME,
self.HAWAIIAN_FIELD_NAME,
self.TWO_OR_MORE_RACES_FIELD_NAME,
self.NON_HISPANIC_WHITE_FIELD_NAME,
self.HISPANIC_FIELD_NAME,
self.OTHER_RACE_FIELD_NAME,
]

var_list = [
self.MEDIAN_INCOME_FIELD,
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD,
@@ -162,6 +222,14 @@
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
self.TOTAL_POP_FIELD,
self.TOTAL_RACE_POPULATION_FIELD,
self.ASIAN_FIELD,
self.TWO_OR_MORE_RACES_FIELD,
self.BLACK_OR_AA_FIELD,
self.NATIVE_HI_OR_API_FIELD,
self.WHITE_FIELD,
self.HISPANIC_OR_LATINO_FIELD,
self.OTHER_ETHNIC_ORIGIN_FIELD,
]
var_list = ",".join(var_list)

@@ -180,6 +248,15 @@ def __init__(self):
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
self.TOTAL_POP_VI_FIELD,
self.BLACK_VI_FIELD,
self.AMERICAN_INDIAN_VI_FIELD,
self.ASIAN_VI_FIELD,
self.HAWAIIAN_VI_FIELD,
self.TWO_OR_MORE_RACES_VI_FIELD,
self.NON_HISPANIC_WHITE_VI_FIELD,
self.HISPANIC_VI_FIELD,
self.OTHER_RACE_VI_FIELD,
self.TOTAL_RACE_POPULATION_VI_FIELD,
]
var_list_vi = ",".join(var_list_vi)

@@ -210,6 +287,23 @@ def __init__(self):
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME,
self.ASIAN_FIELD: self.ASIAN_FIELD_NAME,
self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME,
self.BLACK_OR_AA_FIELD: self.BLACK_FIELD_NAME,
self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME,
self.NATIVE_HI_OR_API_FIELD: self.HAWAIIAN_FIELD_NAME,
self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME,
self.TWO_OR_MORE_RACES_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME,
self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME,
Contributor: Want to add a note like `# Note there are no Two or more races data for AS/GU/MI`?

self.WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
self.HISPANIC_OR_LATINO_FIELD: self.HISPANIC_FIELD_NAME,
self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME,
self.OTHER_ETHNIC_ORIGIN_FIELD: self.OTHER_RACE_FIELD_NAME,
self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME,
}

# To do: Ask Census Slack Group about whether you need to hardcode the county fips
@@ -252,6 +346,8 @@ def __init__(self):
+ "&for=tract:*&in=state:{}%20county:{}"
)

self.final_race_fields: List[str] = []

self.df: pd.DataFrame
self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame
@@ -264,14 +360,16 @@ def extract(self) -> None:
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
api_url = self.API_URL.format(
Contributor: nice

self.DECENNIAL_YEAR,
island["state_abbreviation"],
island["var_list"],
island["fips"],
county,
)
logger.debug(f"CENSUS: Requesting {api_url}")
download = requests.get(
self.API_URL.format(
self.DECENNIAL_YEAR,
island["state_abbreviation"],
island["var_list"],
island["fips"],
county,
),
api_url,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)

@@ -379,6 +477,19 @@ def transform(self) -> None:
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
)

# Calculate stats by race
for race_field_name in self.RE_OUTPUT_FIELDS:
output_field_name = (
field_names.PERCENT_PREFIX
+ race_field_name
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX
)
self.final_race_fields.append(output_field_name)
self.df_all[output_field_name] = (
self.df_all[race_field_name]
/ self.df_all[self.TOTAL_RACE_POPULATION_FIELD_NAME]
)

# Reporting Missing Values
for col in self.df_all.columns:
missing_value_count = self.df_all[col].isnull().sum()
@@ -402,7 +513,7 @@ def load(self) -> None:
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
self.UNEMPLOYMENT_FIELD_NAME,
]
] + self.final_race_fields

self.df_all[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
1 change: 1 addition & 0 deletions data/data-pipeline/data_pipeline/score/field_names.py
@@ -3,6 +3,7 @@
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
ISLAND_AREA_BACKFILL_SUFFIX = " in 2009"

# Geographic field names
GEOID_TRACT_FIELD = "GEOID10_TRACT"
1 change: 0 additions & 1 deletion data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -999,7 +999,6 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame:

def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score Narhwal")

self.df[field_names.THRESHOLD_COUNT] = 0

self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = (
31 changes: 29 additions & 2 deletions data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -5,8 +5,10 @@
import pytest
import pandas as pd
import numpy as np
from data_pipeline.etl.score import constants
from data_pipeline.score import field_names
from data_pipeline.score.field_names import GEOID_TRACT_FIELD
from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
from .fixtures import (
final_score_df,
ejscreen_df,
@@ -287,7 +289,24 @@ def test_data_sources(

# Make sure we have NAs for any tracts in the final data that aren't
# included in the data source
assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
has_additional_non_null_tracts = not np.all(
df[df.MERGE == "left_only"][final_columns].isna()
)
if has_additional_non_null_tracts:
# We backfill island areas with data from the 2010 census, so if THOSE tracts
Contributor: nice comment!! and nice test.

# have data beyond the data source, that's to be expected and is fine to pass.
# If some other state or territory does though, this should fail
left_only = df.loc[(df.MERGE == "left_only")]
left_only_has_value = left_only.loc[
~df[final_columns].isna().all(axis=1)
]
fips_with_values = set(
left_only_has_value[field_names.GEOID_TRACT_FIELD].str[0:2]
)
non_island_fips_codes = fips_with_values.difference(
TILES_ISLAND_AREA_FIPS_CODES
)
assert not non_island_fips_codes

# Make sure the datasource doesn't have a ton of unmatched tracts, implying it
# has moved to 2020 tracts
@@ -365,8 +384,16 @@ def test_imputed_tracts(final_score_df):
)

# Make sure that no tracts with population have null imputed income
# We DO NOT impute income
is_island_area = (
final_score_df[field_names.GEOID_TRACT_FIELD]
.str[:2]
.isin(constants.TILES_ISLAND_AREA_FIPS_CODES)
)

tracts_with_some_population_df = final_score_df[
final_score_df[field_names.TOTAL_POP_FIELD] > 0
(final_score_df[field_names.TOTAL_POP_FIELD] > 0)
& ~is_island_area
]
assert (
not tracts_with_some_population_df[
@@ -156,4 +156,4 @@ def test_tract_id_lengths(self, mock_etl, mock_paths):
"data_pipeline.etl.sources.eamlis.etl.add_tracts_for_geometries",
new=_fake_add_tracts_for_geometries,
):
super().test_tract_id_lengths(mock_etl, mock_paths)
super().test_tract_id_lengths(mock_etl, mock_paths)