Skip to content

Commit

Permalink
Revert "Fast flag update (#1844)"
Browse files (browse the repository at this point in the history)
This reverts commit d892bce.
  • Loading branch information
emma-nechamkin authored Aug 19, 2022
1 parent d892bce commit 5c41c95
Show file tree
Hide file tree
Showing 14 changed files with 29 additions and 61 deletions.
16 changes: 8 additions & 8 deletions data/data-pipeline/data_pipeline/content/config/csv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,16 @@ fields:
label: Total categories exceeded
format: int64
- score_name: Definition N (communities)
label: Identified as disadvantaged without considering neighbors
label: Identified as disadvantaged
format: bool
- score_name: Definition N (communities) (based on adjacency index and low income alone)
label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
- score_name: Definition N (communities) (including adjacency index)
label: Identified as disadvantaged (including adjacency index)
format: bool
- score_name: Definition M community, including adjacency index tracts
label: Identified as disadvantaged
- score_name: Is the tract surrounded by disadvantaged communities?
label: Is the tract surrounded by disadvantaged communities?
format: bool
- score_name: Meets the less stringent low income criterion for the adjacency index?
label: Meets the less stringent low income criterion for the adjacency index?
format: bool
- score_name: Definition N (communities) (average of neighbors)
label: Share of neighbors that are identified as disadvantaged
Expand Down Expand Up @@ -338,6 +341,3 @@ fields:
- score_name: Tract-level redlining score meets or exceeds 3.25
label: Tract experienced historic underinvestment
format: bool
- score_name: Income data has been estimated based on neighbor income
label: Income data has been estimated based on geographic neighbor income
format: bool
16 changes: 8 additions & 8 deletions data/data-pipeline/data_pipeline/content/config/excel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,16 @@ sheets:
label: Total categories exceeded
format: int64
- score_name: Definition N (communities)
label: Identified as disadvantaged without considering neighbors
label: Identified as disadvantaged
format: bool
- score_name: Definition N (communities) (based on adjacency index and low income alone)
label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
- score_name: Definition N (communities) (including adjacency index)
label: Identified as disadvantaged (including adjacency index)
format: bool
- score_name: Definition M community, including adjacency index tracts
label: Identified as disadvantaged
- score_name: Is the tract surrounded by disadvantaged communities?
label: Is the tract surrounded by disadvantaged communities?
format: bool
- score_name: Meets the less stringent low income criterion for the adjacency index?
label: Meets the less stringent low income criterion for the adjacency index?
format: bool
- score_name: Definition N (communities) (average of neighbors)
label: Share of neighbors that are identified as disadvantaged
Expand Down Expand Up @@ -342,6 +345,3 @@ sheets:
- score_name: Tract-level redlining score meets or exceeds 3.25
label: Tract experienced historic underinvestment
format: bool
- score_name: Income data has been estimated based on neighbor income
label: Income data has been estimated based on geographic neighbor income
format: bool
8 changes: 3 additions & 5 deletions data/data-pipeline/data_pipeline/etl/score/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,10 +208,9 @@
field_names.M_HEALTH: "M_HLTH",
# temporarily update this so that it's the Narwhal score that gets visualized on the map
# The NEW final score value INCLUDES the adjacency index.
field_names.FINAL_SCORE_N_BOOLEAN: "SM_C",
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
field_names.SCORE_N_COMMUNITIES
+ field_names.ADJACENT_MEAN_SUFFIX: "SM_DON",
field_names.SCORE_N_COMMUNITIES: "SM_NO_DON",
+ field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EBLRLI",
Expand Down Expand Up @@ -314,8 +313,7 @@
+ field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
field_names.AML_BOOLEAN: "AML_ET",
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG"
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
## FPL_200 (there is no higher ed in narwhal)
}
Expand Down
1 change: 0 additions & 1 deletion data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,6 @@ def _prepare_initial_df(self) -> pd.DataFrame:
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
]

# For some columns, high values are "good", so we want to reverse the percentile
Expand Down
2 changes: 2 additions & 0 deletions data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ def _load_tile_csv(
score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")

def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
logger.info("Saving Downloadable CSV")

downloadable_info_path.mkdir(parents=True, exist_ok=True)
csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
Expand Down

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,6 @@ def __init__(self):
self.COLLEGE_ATTENDANCE_FIELD,
self.COLLEGE_NON_ATTENDANCE_FIELD,
self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
]
+ self.RE_OUTPUT_FIELDS
+ [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
Expand Down Expand Up @@ -504,13 +503,6 @@ def transform(self) -> None:
}
)

# We generate a boolean that is TRUE when there is an imputed income but not a baseline income, and FALSE otherwise.
# This allows us to see which tracts have an imputed income.
df[field_names.IMPUTED_INCOME_FLAG_FIELD_NAME] = (
df[field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD].notna()
& df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
)

# Strip columns and save results to self.
self.df = df[self.COLUMNS_TO_KEEP]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,12 @@ def calculate_income_measures(
)

# Iterate through the dataframe to impute in place
## TODO: We should probably convert this to a spatial join now that we are doing >1 imputation and it's taking a lot
## of time, but thinking through how to do this while maintaining the masking will take some time. I think the best
## way would be to (1) spatial join to all neighbors, and then (2) iterate to take the "smallest" set of neighbors...
## but haven't implemented it yet.
for index, row in geo_df.iterrows():
if row[geoid_field] in tract_list:
neighbor_mask = _get_neighbor_mask(geo_df, row)
county_mask = _get_fips_mask(
geo_df=geo_df, row=row, fips_digits=5, geoid_field=geoid_field
)
## TODO: Did CEQ decide to cut this?
state_mask = _get_fips_mask(
geo_df=geo_df, row=row, fips_digits=2, geoid_field=geoid_field
)
Expand Down
8 changes: 1 addition & 7 deletions data/data-pipeline/data_pipeline/score/field_names.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Suffixes
PERCENTILE_FIELD_SUFFIX = " (percentile)"
ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"

# Geographic field names
Expand All @@ -12,9 +12,6 @@
# Score file field names
# Definition M fields
SCORE_M = "Definition M"
FINAL_SCORE_N_BOOLEAN = (
"Definition M community, including adjacency index tracts"
)
SCORE_M_COMMUNITIES = "Definition M (communities)"
M_CLIMATE = "Climate Factor (Definition M)"
M_ENERGY = "Energy Factor (Definition M)"
Expand Down Expand Up @@ -70,9 +67,6 @@

# this is what gets used in the score
POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD = "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted"
IMPUTED_INCOME_FLAG_FIELD_NAME = (
"Income data has been estimated based on neighbor income"
)
POVERTY_LESS_THAN_150_FPL_FIELD = (
"Percent of individuals < 150% Federal Poverty Line"
)
Expand Down
20 changes: 4 additions & 16 deletions data/data-pipeline/data_pipeline/score/score_narwhal.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,10 +385,8 @@ def _housing_factor(self) -> bool:

# Kitchen / plumbing
self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD] = (
self.df[
field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)

Expand Down Expand Up @@ -973,25 +971,15 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame:
>= self.SCORE_THRESHOLD_DONUT
)

# This constructs the boolean for whether it's a donut hole community
# This can also be true when the tract itself is a DAC on its own
# This should be the "final list" of Score Narwhal communities, meaning that we would
# expect this to be True if either the tract is a donut hole community OR the tract is a DAC
self.df[
field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
] = (
self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
& self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
)

# This should be the "final list" of Score Narwhal communities, meaning that we would
# expect this to be True if either the tract is a donut hole community OR the tract is a DAC
self.df[field_names.FINAL_SCORE_N_BOOLEAN] = (
self.df[field_names.SCORE_N_COMMUNITIES]
| self.df[
field_names.SCORE_N_COMMUNITIES
+ field_names.ADJACENT_MEAN_SUFFIX
]
)

def add_columns(self) -> pd.DataFrame:
logger.info("Adding Score Narhwal")

Expand Down

0 comments on commit 5c41c95

Please sign in to comment.