usds · emma-nechamkin · Aug 19, 2022 · Aug 19, 2022 · Aug 19, 2022 · Aug 19, 2022
diff --git a/data/data-pipeline/data_pipeline/content/config/csv.yml b/data/data-pipeline/data_pipeline/content/config/csv.yml
@@ -21,16 +21,13 @@ fields:
     label: Total categories exceeded
     format: int64
   - score_name: Definition N (communities)
-    label: Identified as disadvantaged
-    format: bool
-  - score_name: Definition N (communities) (including adjacency index)
-    label: Identified as disadvantaged (including adjacency index)
+    label: Identified as disadvantaged without considering neighbors
     format: bool
-  - score_name: Is the tract surrounded by disadvantaged communities?
-    label: Is the tract surrounded by disadvantaged communities?
+  - score_name: Definition N (communities) (based on adjacency index and low income alone)
+    label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
     format: bool
-  - score_name: Meets the less stringent low income criterion for the adjacency index?
-    label: Meets the less stringent low income criterion for the adjacency index?
+  - score_name: Definition M community, including adjacency index tracts
+    label: Identified as disadvantaged
     format: bool
   - score_name: Definition N (communities) (average of neighbors)
     label: Share of neighbors that are identified as disadvantaged
@@ -341,3 +338,6 @@ fields:
   - score_name: Tract-level redlining score meets or exceeds 3.25
     label: Tract experienced historic underinvestment
     format: bool
+  - score_name: Income data has been estimated based on neighbor income
+    label: Income data has been estimated based on geographic neighbor income
+    format: bool
diff --git a/data/data-pipeline/data_pipeline/content/config/excel.yml b/data/data-pipeline/data_pipeline/content/config/excel.yml
@@ -25,16 +25,13 @@ sheets:
         label: Total categories exceeded
         format: int64
       - score_name: Definition N (communities)
-        label: Identified as disadvantaged
-        format: bool
-      - score_name: Definition N (communities) (including adjacency index)
-        label: Identified as disadvantaged (including adjacency index)
+        label: Identified as disadvantaged without considering neighbors
         format: bool
-      - score_name: Is the tract surrounded by disadvantaged communities?
-        label: Is the tract surrounded by disadvantaged communities?
+      - score_name: Definition N (communities) (based on adjacency index and low income alone)
+        label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
         format: bool
-      - score_name: Meets the less stringent low income criterion for the adjacency index?
-        label: Meets the less stringent low income criterion for the adjacency index?
+      - score_name: Definition M community, including adjacency index tracts
+        label: Identified as disadvantaged
         format: bool
       - score_name: Definition N (communities) (average of neighbors)
         label: Share of neighbors that are identified as disadvantaged
@@ -345,3 +342,6 @@ sheets:
       - score_name: Tract-level redlining score meets or exceeds 3.25
         label: Tract experienced historic underinvestment
         format: bool
+      - score_name: Income data has been estimated based on neighbor income
+        label: Income data has been estimated based on geographic neighbor income
+        format: bool
diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -208,9 +208,10 @@
     field_names.M_HEALTH: "M_HLTH",
     # temporarily update this so that it's the Narwhal score that gets visualized on the map
     # The NEW final score value INCLUDES the adjacency index.
-    field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
+    field_names.FINAL_SCORE_N_BOOLEAN: "SM_C",
     field_names.SCORE_N_COMMUNITIES
-    + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
+    + field_names.ADJACENT_MEAN_SUFFIX: "SM_DON",
+    field_names.SCORE_N_COMMUNITIES: "SM_NO_DON",
     field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
     field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
     field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EBLRLI",
@@ -313,7 +314,8 @@
     + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
     field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
     field_names.AML_BOOLEAN: "AML_ET",
-    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
+    field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG"
     ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
     ## FPL_200 (there is no higher ed in narwhal)
 }

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -471,6 +471,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
             field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
             field_names.AML_BOOLEAN,
+            field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
         ]
 
         # For some columns, high values are "good", so we want to reverse the percentile

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -521,8 +521,6 @@ def _load_tile_csv(
         score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")
 
     def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
-        logger.info("Saving Downloadable CSV")
-
         downloadable_info_path.mkdir(parents=True, exist_ok=True)
         csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
         excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH

diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -227,6 +227,7 @@ def __init__(self):
                 self.COLLEGE_ATTENDANCE_FIELD,
                 self.COLLEGE_NON_ATTENDANCE_FIELD,
                 self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
+                field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
             ]
             + self.RE_OUTPUT_FIELDS
             + [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
@@ -503,6 +504,13 @@ def transform(self) -> None:
             }
         )
 
+        # Flag tracts whose income is imputed: True when the imputed poverty field is present
+        # but the baseline (non-imputed) poverty field is missing, and False otherwise.
+        df[field_names.IMPUTED_INCOME_FLAG_FIELD_NAME] = (
+            df[field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD].notna()
+            & df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
+        )
+
         # Strip columns and save results to self.
         self.df = df[self.COLUMNS_TO_KEEP]
 

diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py
@@ -92,12 +92,17 @@ def calculate_income_measures(
     )
 
     # Iterate through the dataframe to impute in place
+    ## TODO: Consider converting this loop to a spatial join now that we run more than one imputation and it
+    ## has become slow. Preserving the masking logic needs some thought; one approach would be to (1) spatial
+    ## join each tract to all of its neighbors, and then (2) iterate to take the "smallest" set of neighbors.
+    ## This has not been implemented yet.
     for index, row in geo_df.iterrows():
         if row[geoid_field] in tract_list:
             neighbor_mask = _get_neighbor_mask(geo_df, row)
             county_mask = _get_fips_mask(
                 geo_df=geo_df, row=row, fips_digits=5, geoid_field=geoid_field
             )
+            ## TODO: Confirm whether CEQ decided to cut the state-level mask computed below.
             state_mask = _get_fips_mask(
                 geo_df=geo_df, row=row, fips_digits=2, geoid_field=geoid_field
             )

diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -1,7 +1,7 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
 ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
-ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
+ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
 ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
 
 # Geographic field names
@@ -12,6 +12,9 @@
 # Score file field names
 # Definition M fields
 SCORE_M = "Definition M"
+FINAL_SCORE_N_BOOLEAN = (
+    "Definition M community, including adjacency index tracts"
+)
 SCORE_M_COMMUNITIES = "Definition M (communities)"
 M_CLIMATE = "Climate Factor (Definition M)"
 M_ENERGY = "Energy Factor (Definition M)"
@@ -67,6 +70,9 @@
 
 # this is what gets used in the score
 POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD = "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted"
+IMPUTED_INCOME_FLAG_FIELD_NAME = (
+    "Income data has been estimated based on neighbor income"
+)
 POVERTY_LESS_THAN_150_FPL_FIELD = (
     "Percent of individuals < 150% Federal Poverty Line"
 )

diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -385,8 +385,10 @@ def _housing_factor(self) -> bool:
 
         # Kitchen / plumbing
         self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD] = (
-            self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD 
-                    + field_names.PERCENTILE_FIELD_SUFFIX]
+            self.df[
+                field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
+                + field_names.PERCENTILE_FIELD_SUFFIX
+            ]
             >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
         )
 
@@ -971,15 +973,25 @@ def _mark_donut_hole_tracts(self) -> pd.DataFrame:
             >= self.SCORE_THRESHOLD_DONUT
         )
 
-        # This should be the "final list" of Score Narwhal communities, meaning that we would
-        # expect this to be True if either the tract is a donut hole community OR the tract is a DAC
+        # This constructs the boolean for whether the tract is a donut hole community.
+        # Note that it can also be True when the tract is a DAC on its own.
         self.df[
             field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
         ] = (
             self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
             & self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
         )
 
+        # This should be the "final list" of Score Narwhal communities, meaning that we would
+        # expect this to be True if either the tract is a donut hole community OR the tract is a DAC
+        self.df[field_names.FINAL_SCORE_N_BOOLEAN] = (
+            self.df[field_names.SCORE_N_COMMUNITIES]
+            | self.df[
+                field_names.SCORE_N_COMMUNITIES
+                + field_names.ADJACENT_MEAN_SUFFIX
+            ]
+        )
+
     def add_columns(self) -> pd.DataFrame:
         logger.info("Adding Score Narhwal")