Revert "Fast flag update (#1844)"

This reverts commit d892bce.
usds · Aug 19, 2022 · 5c41c95
1 parent d892bce
commit 5c41c95
Show file tree

Hide file tree

Showing 14 changed files with 29 additions and 61 deletions.
diff --git a/data/data-pipeline/data_pipeline/content/config/csv.yml b/data/data-pipeline/data_pipeline/content/config/csv.yml
@@ -21,13 +21,16 @@ fields:
     label: Total categories exceeded
     format: int64
   - score_name: Definition N (communities)
-    label: Identified as disadvantaged without considering neighbors
+    label: Identified as disadvantaged
     format: bool
-  - score_name: Definition N (communities) (based on adjacency index and low income alone)
-    label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
+  - score_name: Definition N (communities) (including adjacency index)
+    label: Identified as disadvantaged (including adjacency index)
     format: bool
-  - score_name: Definition M community, including adjacency index tracts
-    label: Identified as disadvantaged
+  - score_name: Is the tract surrounded by disadvantaged communities?
+    label: Is the tract surrounded by disadvantaged communities?
+    format: bool
+  - score_name: Meets the less stringent low income criterion for the adjacency index?
+    label: Meets the less stringent low income criterion for the adjacency index?
     format: bool
   - score_name: Definition N (communities) (average of neighbors)
     label: Share of neighbors that are identified as disadvantaged
@@ -338,6 +341,3 @@ fields:
   - score_name: Tract-level redlining score meets or exceeds 3.25
     label: Tract experienced historic underinvestment
     format: bool
-  - score_name: Income data has been estimated based on neighbor income
-    label: Income data has been estimated based on geographic neighbor income
-    format: bool
diff --git a/data/data-pipeline/data_pipeline/content/config/excel.yml b/data/data-pipeline/data_pipeline/content/config/excel.yml
@@ -25,13 +25,16 @@ sheets:
         label: Total categories exceeded
         format: int64
       - score_name: Definition N (communities)
-        label: Identified as disadvantaged without considering neighbors
+        label: Identified as disadvantaged
         format: bool
-      - score_name: Definition N (communities) (based on adjacency index and low income alone)
-        label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
+      - score_name: Definition N (communities) (including adjacency index)
+        label: Identified as disadvantaged (including adjacency index)
         format: bool
-      - score_name: Definition M community, including adjacency index tracts
-        label: Identified as disadvantaged
+      - score_name: Is the tract surrounded by disadvantaged communities?
+        label: Is the tract surrounded by disadvantaged communities?
+        format: bool
+      - score_name: Meets the less stringent low income criterion for the adjacency index?
+        label: Meets the less stringent low income criterion for the adjacency index?
         format: bool
       - score_name: Definition N (communities) (average of neighbors)
         label: Share of neighbors that are identified as disadvantaged
@@ -342,6 +345,3 @@ sheets:
       - score_name: Tract-level redlining score meets or exceeds 3.25
         label: Tract experienced historic underinvestment
         format: bool
-      - score_name: Income data has been estimated based on neighbor income
-        label: Income data has been estimated based on geographic neighbor income
-        format: bool
diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -208,10 +208,9 @@
     field_names.M_HEALTH: "M_HLTH",
     # temporarily update this so that it's the Narwhal score that gets visualized on the map
     # The NEW final score value INCLUDES the adjacency index.
-    field_names.FINAL_SCORE_N_BOOLEAN: "SM_C",
+    field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
     field_names.SCORE_N_COMMUNITIES
-    + field_names.ADJACENT_MEAN_SUFFIX: "SM_DON",
-    field_names.SCORE_N_COMMUNITIES: "SM_NO_DON",
+    + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
     field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
     field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
     field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EBLRLI",
@@ -314,8 +313,7 @@
     + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
     field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
     field_names.AML_BOOLEAN: "AML_ET",
-    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
-    field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG"
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
     ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
     ## FPL_200 (there is no higher ed in narwhal)
 }

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -471,7 +471,6 @@ def _prepare_initial_df(self) -> pd.DataFrame:
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
             field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
             field_names.AML_BOOLEAN,
-            field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
         ]
 
         # For some columns, high values are "good", so we want to reverse the percentile

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -521,6 +521,8 @@ def _load_tile_csv(
         score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")
 
     def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
+        logger.info("Saving Downloadable CSV")
+
         downloadable_info_path.mkdir(parents=True, exist_ok=True)
         csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
         excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH

diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -227,7 +227,6 @@ def __init__(self):
                 self.COLLEGE_ATTENDANCE_FIELD,
                 self.COLLEGE_NON_ATTENDANCE_FIELD,
                 self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
-                field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
             ]
             + self.RE_OUTPUT_FIELDS
             + [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
@@ -504,13 +503,6 @@ def transform(self) -> None:
             }
         )
 
-        # We generate a boolean that is TRUE when there is an imputed income but not a baseline income, and FALSE otherwise.
-        # This allows us to see which tracts have an imputed income. 
-        df[field_names.IMPUTED_INCOME_FLAG_FIELD_NAME] = (
-            df[field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD].notna()
-            & df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
-        )
-
         # Strip columns and save results to self.
         self.df = df[self.COLUMNS_TO_KEEP]
 

diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py
@@ -92,17 +92,12 @@ def calculate_income_measures(
     )
 
     # Iterate through the dataframe to impute in place
-    ## TODO: We should probably convert this to a spatial join now that we are doing >1 imputation and it's taking a lot
-    ## of time, but thinking through how to do this while maintaining the masking will take some time. I think the best
-    ## way would be to (1) spatial join to all neighbors, and then (2) iterate to take the "smallest" set of neighbors...
-    ## but haven't implemented it yet.
     for index, row in geo_df.iterrows():
         if row[geoid_field] in tract_list:
             neighbor_mask = _get_neighbor_mask(geo_df, row)
             county_mask = _get_fips_mask(
                 geo_df=geo_df, row=row, fips_digits=5, geoid_field=geoid_field
             )
-            ## TODO: Did CEQ decide to cut this?
             state_mask = _get_fips_mask(
                 geo_df=geo_df, row=row, fips_digits=2, geoid_field=geoid_field
             )

diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -1,7 +1,7 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
 ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
-ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
+ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
 ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
 
 # Geographic field names
@@ -12,9 +12,6 @@
 # Score file field names
 # Definition M fields
 SCORE_M = "Definition M"
-FINAL_SCORE_N_BOOLEAN = (
-    "Definition M community, including adjacency index tracts"
-)
 SCORE_M_COMMUNITIES = "Definition M (communities)"
 M_CLIMATE = "Climate Factor (Definition M)"
 M_ENERGY = "Energy Factor (Definition M)"
@@ -70,9 +67,6 @@
 
 # this is what gets used in the score
 POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD = "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted"
-IMPUTED_INCOME_FLAG_FIELD_NAME = (
-    "Income data has been estimated based on neighbor income"
-)
 POVERTY_LESS_THAN_150_FPL_FIELD = (
     "Percent of individuals < 150% Federal Poverty Line"
 )

diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -385,10 +385,8 @@ def _housing_factor(self) -> bool:
 
         # Kitchen / plumbing
         self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD] = (
-            self.df[
-                field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
-                + field_names.PERCENTILE_FIELD_SUFFIX
-            ]
+            self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD 
+                    + field_names.PERCENTILE_FIELD_SUFFIX]
             >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
         )
 
@@ -973,25 +971,15 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame:
             >= self.SCORE_THRESHOLD_DONUT
         )
 
-        # This constructs the boolean for whether it's a donut hole community
-        # This can also be true when the tract itself is a DAC on its own
+        # This should be the "final list" of Score Narwhal communities, meaning that we would
+        # expect this to be True if either the tract is a donut hole community OR the tract is a DAC
         self.df[
             field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
         ] = (
             self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
             & self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
         )
 
-        # This should be the "final list" of Score Narwhal communities, meaning that we would
-        # expect this to be True if either the tract is a donut hole community OR the tract is a DAC
-        self.df[field_names.FINAL_SCORE_N_BOOLEAN] = (
-            self.df[field_names.SCORE_N_COMMUNITIES]
-            | self.df[
-                field_names.SCORE_N_COMMUNITIES
-                + field_names.ADJACENT_MEAN_SUFFIX
-            ]
-        )
-
     def add_columns(self) -> pd.DataFrame:
         logger.info("Adding Score Narhwal")