Add demos for island areas #1932

4 changes: 1 addition & 3 deletions data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -381,8 +381,6 @@
field_names.PERCENT_AGE_OVER_64: "AGE_OLD",
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT: "TA_COUNT",
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC",


}

# columns to round floats to 2 decimals
@@ -456,5 +454,5 @@
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
]
54 changes: 53 additions & 1 deletion data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -1,4 +1,6 @@
import functools
from typing import List

from dataclasses import dataclass

import numpy as np
@@ -56,6 +58,8 @@ def __init__(self):
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame

self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []

def extract(self) -> None:
logger.info("Loading data sets from disk.")

@@ -402,6 +406,25 @@ def _prepare_initial_df(self) -> pd.DataFrame:
df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD]
)

self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [
field_names.PERCENT_BLACK_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_ASIAN_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_HAWAIIAN_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_HISPANIC_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
field_names.PERCENT_OTHER_RACE_FIELD_NAME
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX,
]

# Donut columns get added later
numeric_columns = [
field_names.HOUSING_BURDEN_FIELD,
@@ -471,7 +494,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
field_names.PERCENT_AGE_OVER_64,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
]
] + self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS

non_numeric_columns = [
self.GEOID_TRACT_FIELD_NAME,
@@ -636,6 +659,32 @@ class ReversePercentile:

return df_copy

@staticmethod
def _get_island_areas(df: pd.DataFrame) -> pd.Series:
return (
df[field_names.GEOID_TRACT_FIELD]
.str[:2]
.isin(constants.TILES_ISLAND_AREA_FIPS_CODES)
)

def _backfill_island_demographics(self, df: pd.DataFrame) -> pd.DataFrame:
Contributor

is it worth adding a test for this? it seems like it might be straightforward to test and would add a lot of confidence.

Contributor Author

Good call --- done in 403c55a

It is an ugly test, but it is a test.

Contributor

Got it. I was thinking a test directly of _backfill_island_demographics (just give it a simple input DF and test the output DF), but your test works!
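A minimal sketch of the kind of direct test described here, under stated assumptions: ScoreETL() can be constructed bare without side effects, "66" (Guam) is among constants.TILES_ISLAND_AREA_FIPS_CODES, and the tract IDs and values are illustrative. This is not the test that actually landed in 403c55a.

import pandas as pd

from data_pipeline.etl.score.etl_score import ScoreETL
from data_pipeline.score import field_names


def test_backfill_island_demographics():
    # Hypothetical direct test: build a two-row frame (one Guam tract, one
    # states tract), run the backfill, and check only the island row changes.
    etl = ScoreETL()
    backfill_col = (
        field_names.PERCENT_BLACK_FIELD_NAME
        + field_names.ISLAND_AREA_BACKFILL_SUFFIX
    )
    etl.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS = [backfill_col]

    df = pd.DataFrame(
        {
            field_names.GEOID_TRACT_FIELD: ["66010950100", "01001020100"],
            field_names.PERCENT_BLACK_FIELD_NAME: [None, 0.25],
            backfill_col: [0.10, None],
            field_names.TOTAL_POP_FIELD: [None, 2000],
            field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010: [1500, None],
        }
    )

    result = etl._backfill_island_demographics(df)

    # Island tract (GEOID prefix "66") picks up the 2009 decennial values
    assert result.loc[0, field_names.PERCENT_BLACK_FIELD_NAME] == 0.10
    assert result.loc[0, field_names.TOTAL_POP_FIELD] == 1500
    # States tract keeps its ACS value, and the helper column is dropped
    assert result.loc[1, field_names.PERCENT_BLACK_FIELD_NAME] == 0.25
    assert backfill_col not in result.columns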

logger.info("Backfilling island demographic data")
island_index = self._get_island_areas(df)
for backfill_field_name in self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS:
actual_field_name = backfill_field_name.replace(
field_names.ISLAND_AREA_BACKFILL_SUFFIX, ""
)
df.loc[island_index, actual_field_name] = df.loc[
island_index, backfill_field_name
]
df = df.drop(columns=self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS)

df.loc[island_index, field_names.TOTAL_POP_FIELD] = df.loc[
island_index, field_names.COMBINED_CENSUS_TOTAL_POPULATION_2010
]

return df

def transform(self) -> None:
logger.info("Transforming Score Data")

@@ -645,6 +694,9 @@ def transform(self) -> None:
# calculate scores
self.df = ScoreRunner(df=self.df).calculate_scores()

# We add island demographic data since it doesn't matter to the score anyway
self.df = self._backfill_island_demographics(self.df)

def load(self) -> None:
logger.info("Saving Score CSV")
constants.DATA_SCORE_CSV_FULL_DIR.mkdir(parents=True, exist_ok=True)
126 changes: 118 additions & 8 deletions data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
@@ -1,4 +1,5 @@
import json
from typing import List
import requests

import numpy as np
@@ -147,6 +148,65 @@ def __init__(self):
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009
)

# Race/Ethnicity fields
Contributor Author

@lucasmbrown-usds I'd love your feedback on how I mapped these various variables back to the main results. I tried to follow similar logic as was in the ACS ETL, and also it's subjective so I'm happy to make any changes.

self.TOTAL_RACE_POPULATION_FIELD = "PCT086001" # Total
self.ASIAN_FIELD = "PCT086002" # Total!!Asian
self.BLACK_FIELD = "PCT086003" # Total!!Black or African American
self.HAWAIIAN_FIELD = (
"PCT086004" # Total!!Native Hawaiian and Other Pacific Islander
)
# Note that the 2010 census for island areas does not break out
# hispanic and non-hispanic white, so this is slightly different from
# our other demographic data
self.NON_HISPANIC_WHITE_FIELD = "PCT086005" # Total!!White
self.HISPANIC_FIELD = "PCT086006" # Total!!Hispanic or Latino
self.OTHER_RACE_FIELD = (
"PCT086007" # Total!!Other Ethnic Origin or Ra
)

self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total
self.BLACK_VI_FIELD = (
"P003003" # Total!!One race!!Black or African American alone
)
self.AMERICAN_INDIAN_VI_FIELD = "P003005" # Total!!One race!!American Indian and Alaska Native alone
self.ASIAN_VI_FIELD = "P003006" # Total!!One race!!Asian alone
self.HAWAIIAN_VI_FIELD = "P003007" # Total!!One race!!Native Hawaiian and Other Pacific Islander alone
self.TWO_OR_MORE_RACES_VI_FIELD = "P003009" # Total!!Two or More Races
self.NON_HISPANIC_WHITE_VI_FIELD = (
"P005006" # Total!!Not Hispanic or Latino!!One race!!White alone
)
self.HISPANIC_VI_FIELD = "P005002" # Total!!Hispanic or Latino
self.OTHER_RACE_VI_FIELD = (
"P003008" # Total!!One race!!Some Other Race alone
)
self.TOTAL_RACE_POPULATION_VI_FIELD = "P003001" # Total

self.TOTAL_RACE_POPULATION_FIELD_NAME = (
"Total population surveyed on racial data"
)
self.BLACK_FIELD_NAME = "Black or African American"
self.AMERICAN_INDIAN_FIELD_NAME = "American Indian / Alaska Native"
self.ASIAN_FIELD_NAME = "Asian"
self.HAWAIIAN_FIELD_NAME = "Native Hawaiian or Pacific"
self.TWO_OR_MORE_RACES_FIELD_NAME = "two or more races"
self.NON_HISPANIC_WHITE_FIELD_NAME = "White"
self.HISPANIC_FIELD_NAME = "Hispanic or Latino"
# Note that `other` is lowercase because the whole field will show up in the download
# file as "Percent other races"
self.OTHER_RACE_FIELD_NAME = "other races"

# Name output demographics fields.
self.RE_OUTPUT_FIELDS = [
self.BLACK_FIELD_NAME,
self.AMERICAN_INDIAN_FIELD_NAME,
self.ASIAN_FIELD_NAME,
self.HAWAIIAN_FIELD_NAME,
self.TWO_OR_MORE_RACES_FIELD_NAME,
self.NON_HISPANIC_WHITE_FIELD_NAME,
self.HISPANIC_FIELD_NAME,
self.OTHER_RACE_FIELD_NAME,
]

var_list = [
self.MEDIAN_INCOME_FIELD,
self.TOTAL_HOUSEHOLD_RATIO_INCOME_TO_POVERTY_LEVEL_FIELD,
@@ -162,6 +222,13 @@ def __init__(self):
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
self.TOTAL_POP_FIELD,
self.TOTAL_RACE_POPULATION_FIELD,
self.ASIAN_FIELD,
self.BLACK_FIELD,
self.HAWAIIAN_FIELD,
self.NON_HISPANIC_WHITE_FIELD,
self.HISPANIC_FIELD,
self.OTHER_RACE_FIELD,
]
var_list = ",".join(var_list)

@@ -180,6 +247,15 @@ def __init__(self):
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_VI_FIELD,
self.EMPLOYMENT_FEMALE_UNEMPLOYED_VI_FIELD,
self.TOTAL_POP_VI_FIELD,
self.BLACK_VI_FIELD,
self.AMERICAN_INDIAN_VI_FIELD,
self.ASIAN_VI_FIELD,
self.HAWAIIAN_VI_FIELD,
self.TWO_OR_MORE_RACES_VI_FIELD,
self.NON_HISPANIC_WHITE_VI_FIELD,
self.HISPANIC_VI_FIELD,
self.OTHER_RACE_VI_FIELD,
self.TOTAL_RACE_POPULATION_VI_FIELD,
]
var_list_vi = ",".join(var_list_vi)

@@ -210,6 +286,23 @@ def __init__(self):
self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_MALE_UNEMPLOYED_FIELD,
self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD: self.EMPLOYMENT_FEMALE_IN_LABOR_FORCE_FIELD,
self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD: self.EMPLOYMENT_FEMALE_UNEMPLOYED_FIELD,
self.TOTAL_RACE_POPULATION_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
self.TOTAL_RACE_POPULATION_VI_FIELD: self.TOTAL_RACE_POPULATION_FIELD_NAME,
# Note there is no American Indian data for AS/GU/MI
self.AMERICAN_INDIAN_VI_FIELD: self.AMERICAN_INDIAN_FIELD_NAME,
self.ASIAN_FIELD: self.ASIAN_FIELD_NAME,
self.ASIAN_VI_FIELD: self.ASIAN_FIELD_NAME,
self.BLACK_FIELD: self.BLACK_FIELD_NAME,
self.BLACK_VI_FIELD: self.BLACK_FIELD_NAME,
self.HAWAIIAN_FIELD: self.HAWAIIAN_FIELD_NAME,
self.HAWAIIAN_VI_FIELD: self.HAWAIIAN_FIELD_NAME,
self.TWO_OR_MORE_RACES_VI_FIELD: self.TWO_OR_MORE_RACES_FIELD_NAME,
Contributor

want to add a note like `# Note there are no Two or more races data for AS/GU/MI

self.NON_HISPANIC_WHITE_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
self.NON_HISPANIC_WHITE_VI_FIELD: self.NON_HISPANIC_WHITE_FIELD_NAME,
self.HISPANIC_FIELD: self.HISPANIC_FIELD_NAME,
self.HISPANIC_VI_FIELD: self.HISPANIC_FIELD_NAME,
self.OTHER_RACE_FIELD: self.OTHER_RACE_FIELD_NAME,
self.OTHER_RACE_VI_FIELD: self.OTHER_RACE_FIELD_NAME,
}

# To do: Ask Census Slack Group about whether you need to hardcode the county fips
@@ -252,6 +345,8 @@ def __init__(self):
+ "&for=tract:*&in=state:{}%20county:{}"
)

self.final_race_fields: List[str] = []

self.df: pd.DataFrame
self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame
@@ -264,14 +359,16 @@ def extract(self) -> None:
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
api_url = self.API_URL.format(
Contributor

nice

self.DECENNIAL_YEAR,
island["state_abbreviation"],
island["var_list"],
island["fips"],
county,
)
logger.debug(f"CENSUS: Requesting {api_url}")
download = requests.get(
self.API_URL.format(
self.DECENNIAL_YEAR,
island["state_abbreviation"],
island["var_list"],
island["fips"],
county,
),
api_url,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)

@@ -379,6 +476,19 @@ def transform(self) -> None:
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
)

# Calculate stats by race
for race_field_name in self.RE_OUTPUT_FIELDS:
output_field_name = (
field_names.PERCENT_PREFIX
+ race_field_name
+ field_names.ISLAND_AREA_BACKFILL_SUFFIX
)
self.final_race_fields.append(output_field_name)
self.df_all[output_field_name] = (
self.df_all[race_field_name]
/ self.df_all[self.TOTAL_RACE_POPULATION_FIELD_NAME]
)

# Reporting Missing Values
for col in self.df_all.columns:
missing_value_count = self.df_all[col].isnull().sum()
Expand All @@ -402,7 +512,7 @@ def load(self) -> None:
self.PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL_FIELD_NAME,
self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME,
self.UNEMPLOYMENT_FIELD_NAME,
]
] + self.final_race_fields

self.df_all[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
1 change: 1 addition & 0 deletions data/data-pipeline/data_pipeline/score/field_names.py
@@ -3,6 +3,7 @@
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
ISLAND_AREA_BACKFILL_SUFFIX = " in 2009"

# Geographic field names
GEOID_TRACT_FIELD = "GEOID10_TRACT"
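This one-line suffix is what ties the two halves of the PR together: census_decennial/etl.py writes island-area race percentages under names ending in the suffix, and ScoreETL._backfill_island_demographics strips the suffix to find the main demographic column to overwrite. A small sketch of that naming round trip, assuming PERCENT_PREFIX is "Percent " and PERCENT_BLACK_FIELD_NAME is "Percent Black or African American" (neither value appears in this diff):

# Sketch of the field-name round trip; the two assumed values are noted below.
PERCENT_PREFIX = "Percent "                      # assumed value of field_names.PERCENT_PREFIX
ISLAND_AREA_BACKFILL_SUFFIX = " in 2009"         # added in this diff
BLACK_FIELD_NAME = "Black or African American"   # from census_decennial/etl.py above

# The decennial ETL's transform() builds the backfill column name:
backfill_col = PERCENT_PREFIX + BLACK_FIELD_NAME + ISLAND_AREA_BACKFILL_SUFFIX
assert backfill_col == "Percent Black or African American in 2009"

# ScoreETL later strips the suffix to locate the column it overwrites for
# island-area tracts (assumed to equal field_names.PERCENT_BLACK_FIELD_NAME):
actual_col = backfill_col.replace(ISLAND_AREA_BACKFILL_SUFFIX, "")
assert actual_col == "Percent Black or African American"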
1 change: 0 additions & 1 deletion data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -999,7 +999,6 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame:

def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score Narwhal")

self.df[field_names.THRESHOLD_COUNT] = 0

self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] = (