Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast flag update #1844

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions data/data-pipeline/data_pipeline/content/config/csv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,13 @@ fields:
label: Total categories exceeded
format: int64
- score_name: Definition N (communities)
label: Identified as disadvantaged
format: bool
- score_name: Definition N (communities) (including adjacency index)
label: Identified as disadvantaged (including adjacency index)
label: Identified as disadvantaged without considering neighbors
format: bool
- score_name: Is the tract surrounded by disadvantaged communities?
label: Is the tract surrounded by disadvantaged communities?
- score_name: Definition N (communities) (based on adjacency index and low income alone)
label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
format: bool
- score_name: Meets the less stringent low income criterion for the adjacency index?
label: Meets the less stringent low income criterion for the adjacency index?
- score_name: Definition M community, including adjacency index tracts
label: Identified as disadvantaged
format: bool
- score_name: Definition N (communities) (average of neighbors)
label: Share of neighbors that are identified as disadvantaged
Expand Down Expand Up @@ -341,3 +338,6 @@ fields:
- score_name: Tract-level redlining score meets or exceeds 3.25
label: Tract experienced historic underinvestment
format: bool
- score_name: Income data has been estimated based on neighbor income
label: Income data has been estimated based on geographic neighbor income
format: bool
16 changes: 8 additions & 8 deletions data/data-pipeline/data_pipeline/content/config/excel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,13 @@ sheets:
label: Total categories exceeded
format: int64
- score_name: Definition N (communities)
label: Identified as disadvantaged
format: bool
- score_name: Definition N (communities) (including adjacency index)
label: Identified as disadvantaged (including adjacency index)
label: Identified as disadvantaged without considering neighbors
format: bool
- score_name: Is the tract surrounded by disadvantaged communities?
label: Is the tract surrounded by disadvantaged communities?
- score_name: Definition N (communities) (based on adjacency index and low income alone)
label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
format: bool
- score_name: Meets the less stringent low income criterion for the adjacency index?
label: Meets the less stringent low income criterion for the adjacency index?
- score_name: Definition M community, including adjacency index tracts
label: Identified as disadvantaged
format: bool
- score_name: Definition N (communities) (average of neighbors)
label: Share of neighbors that are identified as disadvantaged
Expand Down Expand Up @@ -345,3 +342,6 @@ sheets:
- score_name: Tract-level redlining score meets or exceeds 3.25
label: Tract experienced historic underinvestment
format: bool
- score_name: Income data has been estimated based on neighbor income
label: Income data has been estimated based on geographic neighbor income
format: bool
8 changes: 5 additions & 3 deletions data/data-pipeline/data_pipeline/etl/score/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,10 @@
field_names.M_HEALTH: "M_HLTH",
# Temporarily updated so that the Narwhal score is the one that gets visualized on the map.
# The NEW final score value INCLUDES the adjacency index.
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
field_names.FINAL_SCORE_N_BOOLEAN: "SM_C",
field_names.SCORE_N_COMMUNITIES
+ field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
+ field_names.ADJACENT_MEAN_SUFFIX: "SM_DON",
field_names.SCORE_N_COMMUNITIES: "SM_NO_DON",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EBLRLI",
Expand Down Expand Up @@ -313,7 +314,8 @@
+ field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
field_names.AML_BOOLEAN: "AML_ET",
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG"
## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
## FPL_200 (there is no higher ed in narwhal)
}
Expand Down
1 change: 1 addition & 0 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
]

# For some columns, high values are "good", so we want to reverse the percentile
Expand Down
2 changes: 0 additions & 2 deletions data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,6 @@ def _load_tile_csv(
score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")

def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
logger.info("Saving Downloadable CSV")

downloadable_info_path.mkdir(parents=True, exist_ok=True)
csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
Expand Down

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def __init__(self):
self.COLLEGE_ATTENDANCE_FIELD,
self.COLLEGE_NON_ATTENDANCE_FIELD,
self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
]
+ self.RE_OUTPUT_FIELDS
+ [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
Expand Down Expand Up @@ -503,6 +504,13 @@ def transform(self) -> None:
}
)

# We generate a boolean that is TRUE when there is an imputed income but not a baseline income, and FALSE otherwise.
# This allows us to see which tracts have an imputed income.
df[field_names.IMPUTED_INCOME_FLAG_FIELD_NAME] = (
df[field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD].notna()
& df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
)

# Strip columns and save results to self.
self.df = df[self.COLUMNS_TO_KEEP]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,17 @@ def calculate_income_measures(
)

# Iterate through the dataframe to impute in place
## TODO: Convert this loop to a spatial join. Now that we run more than one imputation, iterating row by row
## takes a long time. Doing so while preserving the masking needs some thought; the likely approach is to
## (1) spatial join each tract to all of its neighbors, and then (2) iterate to take the "smallest" set of
## neighbors. This has not been implemented yet.
for index, row in geo_df.iterrows():
if row[geoid_field] in tract_list:
neighbor_mask = _get_neighbor_mask(geo_df, row)
county_mask = _get_fips_mask(
geo_df=geo_df, row=row, fips_digits=5, geoid_field=geoid_field
)
## TODO: Did CEQ decide to cut this?
state_mask = _get_fips_mask(
geo_df=geo_df, row=row, fips_digits=2, geoid_field=geoid_field
)
Expand Down
8 changes: 7 additions & 1 deletion data/data-pipeline/data_pipeline/score/field_names.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Suffixes
PERCENTILE_FIELD_SUFFIX = " (percentile)"
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"

# Geographic field names
Expand All @@ -12,6 +12,9 @@
# Score file field names
# Definition M fields
SCORE_M = "Definition M"
FINAL_SCORE_N_BOOLEAN = (
"Definition M community, including adjacency index tracts"
)
SCORE_M_COMMUNITIES = "Definition M (communities)"
M_CLIMATE = "Climate Factor (Definition M)"
M_ENERGY = "Energy Factor (Definition M)"
Expand Down Expand Up @@ -67,6 +70,9 @@

# this is what gets used in the score
POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD = "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted"
IMPUTED_INCOME_FLAG_FIELD_NAME = (
"Income data has been estimated based on neighbor income"
)
POVERTY_LESS_THAN_150_FPL_FIELD = (
"Percent of individuals < 150% Federal Poverty Line"
)
Expand Down
20 changes: 16 additions & 4 deletions data/data-pipeline/data_pipeline/score/score_narwhal.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,10 @@ def _housing_factor(self) -> bool:

# Kitchen / plumbing
self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD] = (
self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX]
self.df[
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)

Expand Down Expand Up @@ -971,15 +973,25 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame:
>= self.SCORE_THRESHOLD_DONUT
)

# This should be the "final list" of Score Narwhal communities, meaning that we would
# expect this to be True if either the tract is a donut hole community OR the tract is a DAC
# This constructs the boolean for whether the tract is a donut hole community.
# Note that it can also be True when the tract itself is a DAC on its own.
self.df[
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
] = (
self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
& self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
)

# This should be the "final list" of Score Narwhal communities, meaning that we would
# expect this to be True if either the tract is a donut hole community OR the tract is a DAC
self.df[field_names.FINAL_SCORE_N_BOOLEAN] = (
self.df[field_names.SCORE_N_COMMUNITIES]
| self.df[
field_names.SCORE_N_COMMUNITIES
+ field_names.ADJACENT_MEAN_SUFFIX
]
)

def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score Narhwal")

Expand Down