Adding first street foundation data (#1823)

Adding FSF flood and wildfire risk datasets to the score.
usds · Aug 17, 2022 · 5e378ae · 5e378ae
1 parent ebac552
commit 5e378ae
Show file tree

Hide file tree

Showing 21 changed files with 430 additions and 82 deletions.
diff --git a/data/data-pipeline/data_pipeline/content/config/csv.yml b/data/data-pipeline/data_pipeline/content/config/csv.yml
@@ -272,3 +272,21 @@ fields:
   - score_name: Leaky underground storage tanks
     label: Leaky underground storage tanks
     format: float
+  - score_name: Share of properties at risk of flood in 30 years
+    label: Share of properties at risk of flood in 30 years
+    format: float
+  - score_name: Share of properties at risk of fire in 30 years
+    label: Share of properties at risk of fire in 30 years
+    format: float
+  - score_name: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years and is low income?
+    label: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years and is low income?
+    format: bool
+  - score_name: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years and is low income?
+    label: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years and is low income?
+    format: bool
+  - score_name: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years
+    label: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years
+    format: bool
+  - score_name: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years
+    label: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years
+    format: bool
diff --git a/data/data-pipeline/data_pipeline/content/config/excel.yml b/data/data-pipeline/data_pipeline/content/config/excel.yml
@@ -276,3 +276,21 @@ sheets:
       - score_name: Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?
         label: Greater than or equal to the 90th percentile for low median household income as a percent of area median income and has low HS education in 2009 (island areas)?
         format: bool
+      - score_name: Share of properties at risk of flood in 30 years
+        label: Share of properties at risk of flood in 30 years
+        format: float
+      - score_name: Share of properties at risk of fire in 30 years
+        label: Share of properties at risk of fire in 30 years
+        format: float
+      - score_name: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years and is low income?
+        label: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years and is low income?
+        format: bool
+      - score_name: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years and is low income?
+        label: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years and is low income?
+        format: bool
+      - score_name: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years
+        label: Greater than or equal to the 90th percentile for share of properties at risk of flood in 30 years
+        format: bool
+      - score_name: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years
+        label: Greater than or equal to the 90th percentile for share of properties at risk of fire in 30 years
+        format: bool
diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -34,6 +34,16 @@
         "module_dir": "mapping_for_ej",
         "class_name": "MappingForEJETL",
     },
+    {
+        "name": "fsf_flood_risk",
+        "module_dir": "fsf_flood_risk",
+        "class_name": "FloodRiskETL",
+    },
+    {
+        "name": "fsf_wildfire_risk",
+        "module_dir": "fsf_wildfire_risk",
+        "class_name": "WildfireRiskETL",
+    },
     {
         "name": "ejscreen",
         "module_dir": "ejscreen",

diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@@ -157,6 +157,88 @@ datasets:
         include_in_tiles: true
         include_in_downloadable_files: true
 
+  - long_name: "First Street Foundation Flood Risk"
+    short_name: "FSF Flood Risk"
+    module_name: fsf_flood_risk
+    input_geoid_tract_field_name: "GEOID"
+    load_fields:
+      - short_name: "flood_eligible_properties"
+        df_field_name: "COUNT_PROPERTIES"
+        long_name: "Count of properties eligible for flood risk calculation within tract (floor of 250)" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "flood_risk_properties_today"
+        df_field_name: "PROPERTIES_AT_RISK_FROM_FLOODING_TODAY"
+        long_name: "Count of properties at risk of flood today" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "flood_risk_properties_30yrs"
+        df_field_name: "PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS"
+        long_name: "Count of properties at risk of flood in 30 years" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "flood_risk_share_today"
+        df_field_name: "SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY"
+        long_name: "Share of properties at risk of flood today" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: true
+      - short_name: "flood_risk_share_30yrs"
+        df_field_name: "SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS"
+        long_name: "Share of properties at risk of flood in 30 years" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: true
+
+  - long_name: "First Street Foundation Wildfire Risk"
+    short_name: "FSF Wildfire Risk"
+    module_name: fsf_wildfire_risk
+    input_geoid_tract_field_name: "GEOID"
+    load_fields:
+      - short_name: "fire_eligible_properties"
+        df_field_name: "COUNT_PROPERTIES"
+        long_name: "Count of properties eligible for wildfire risk calculation within tract (floor of 250)" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "fire_risk_properties_today"
+        df_field_name: "PROPERTIES_AT_RISK_FROM_FIRE_TODAY"
+        long_name: "Count of properties at risk of wildfire today" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "fire_risk_properties_30yrs"
+        df_field_name: "PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS"
+        long_name: "Count of properties at risk of wildfire in 30 years" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "fire_risk_share_today"
+        df_field_name: "SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY"
+        long_name: "Share of properties at risk of fire today" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: true
+      - short_name: "fire_risk_share_30yrs"
+        df_field_name: "SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS"
+        long_name: "Share of properties at risk of fire in 30 years" 
+        field_type: float
+        include_in_tiles: false
+        include_in_downloadable_files: true
+        create_percentile: true
+
   - long_name: "DOT Travel Disadvantage Index"
     short_name: "DOT"
     module_name: "travel_composite"

diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -293,12 +293,18 @@
     field_names.WORKFORCE_THRESHOLD_EXCEEDED: "M_WKFC_EOMI",
     # These are the booleans for socioeconomic indicators
     ## this measures low income boolean
-    field_names.FPL_200_SERIES: "FPL200S",
+    field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED: "FPL200S",
     ## Low high school for t&wd
     field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
     field_names.DOT_BURDEN_PCTILE_THRESHOLD: "TD_ET",
     field_names.DOT_TRAVEL_BURDEN_FIELD
-    + field_names.PERCENTILE_FIELD_SUFFIX: "TD_PFS"
+    + field_names.PERCENTILE_FIELD_SUFFIX: "TD_PFS",
+    field_names.FUTURE_FLOOD_RISK_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "FLD_PFS",
+    field_names.FUTURE_WILDFIRE_RISK_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
+    field_names.HIGH_FUTURE_FLOOD_RISK_FIELD: "FLD_ET",
+    field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD: "WF_ET",
     ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
     ## FPL_200 (there is no higher ed in narwhal)
 }
@@ -352,4 +358,7 @@
     field_names.COLLEGE_NON_ATTENDANCE_FIELD,
     field_names.COLLEGE_ATTENDANCE_FIELD,
     field_names.DOT_TRAVEL_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.FUTURE_FLOOD_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.FUTURE_WILDFIRE_RISK_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -11,6 +11,10 @@
 from data_pipeline.etl.sources.dot_travel_composite.etl import (
     TravelCompositeETL,
 )
+from data_pipeline.etl.sources.fsf_flood_risk.etl import (
+    FloodRiskETL,
+)
+from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.score import field_names
 from data_pipeline.etl.score import constants
@@ -41,6 +45,8 @@ def __init__(self):
         self.child_opportunity_index_df: pd.DataFrame
         self.hrs_df: pd.DataFrame
         self.dot_travel_disadvantage_df: pd.DataFrame
+        self.fsf_flood_df: pd.DataFrame
+        self.fsf_fire_df: pd.DataFrame
 
     def extract(self) -> None:
         logger.info("Loading data sets from disk.")
@@ -122,6 +128,12 @@ def extract(self) -> None:
         # Load DOT Travel Disadvantage
         self.dot_travel_disadvantage_df = TravelCompositeETL.get_data_frame()
 
+        # Load fire risk data
+        self.fsf_fire_df = WildfireRiskETL.get_data_frame()
+
+        # Load flood risk data
+        self.fsf_flood_df = FloodRiskETL.get_data_frame()
+
         # Load GeoCorr Urban Rural Map
         geocorr_urban_rural_csv = (
             constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@@ -342,6 +354,8 @@ def _prepare_initial_df(self) -> pd.DataFrame:
             self.child_opportunity_index_df,
             self.hrs_df,
             self.dot_travel_disadvantage_df,
+            self.fsf_flood_df,
+            self.fsf_fire_df,
         ]
 
         # Sanity check each data frame before merging.
@@ -426,6 +440,8 @@ def _prepare_initial_df(self) -> pd.DataFrame:
             field_names.UST_FIELD,
             field_names.DOT_TRAVEL_BURDEN_FIELD,
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
+            field_names.FUTURE_FLOOD_RISK_FIELD,
+            field_names.FUTURE_WILDFIRE_RISK_FIELD,
             field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
         ]
 

diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/README.md b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/README.md
@@ -0,0 +1,3 @@
+# FSF flood risk data
+
+Flood risk is computed as being within the 1-in-100-year flood zone.
diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/__init__.py
diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py
@@ -0,0 +1,108 @@
+# pylint: disable=unsubscriptable-object
+# pylint: disable=unsupported-assignment-operation
+
+import pandas as pd
+from data_pipeline.config import settings
+
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class FloodRiskETL(ExtractTransformLoad):
+    """ETL class for the First Street Foundation flood risk dataset.
+
+    Reads tract-level property counts from the FSF flood CSV and derives the
+    share of properties at risk of flooding today and in 30 years.
+    """
+
+    NAME = "fsf_flood_risk"
+    SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
+    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+
+    # Output score variables (values set on datasets.yml) for linting purposes
+    COUNT_PROPERTIES: str
+    PROPERTIES_AT_RISK_FROM_FLOODING_TODAY: str
+    PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
+    SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY: str
+    SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
+
+    def __init__(self):
+        # define the full path for the input CSV file
+        self.INPUT_CSV = (
+            self.get_tmp_path() / "fsf_flood" / "flood_tract_2010.csv"
+        )
+
+        # this is the main dataframe
+        self.df: pd.DataFrame
+
+        # Start dataset-specific vars here
+        # Source CSV column names for the property counts
+        self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
+        self.COUNT_PROPERTIES_AT_RISK_TODAY = "mid_depth_100_year00"
+        self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
+        # Floor applied to the property count before computing shares
+        self.CLIP_PROPERTIES_COUNT = 250
+
+    def transform(self) -> None:
+        """Build ``self.output_df`` from the unzipped FSF flood CSV.
+
+        - Adds a tract ID column zero-padded to 11 characters
+        - Sums the numeric columns of rows that share a tract ID
+        - Calculates share of properties at risk today and in 30 years,
+          left-clipping number of properties at 250
+        - Renames the at-risk count columns to their output names
+        """
+        logger.info("Transforming FSF Flood Risk Data")
+
+        logger.info(self.COLUMNS_TO_KEEP)
+        # read in the unzipped csv data source; the input tract ID column is
+        # read as str so it can be zero-padded below
+        df_fsf_flood_disagg: pd.DataFrame = pd.read_csv(
+            self.INPUT_CSV,
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
+            low_memory=False,
+        )
+
+        # Zero-pad the tract ID to 11 characters for merging
+        df_fsf_flood_disagg[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood_disagg[
+            self.INPUT_GEOID_TRACT_FIELD_NAME
+        ].str.zfill(11)
+
+        # Because we have some tracts that are listed twice, we aggregate based on
+        # GEOID10_TRACT.
+        # TODO: confirm with First Street Foundation that summing duplicate
+        # tract rows is the right way to combine them.
+        # numeric_only=True restricts the sum to numeric columns, so string
+        # columns (e.g. the raw input tract ID) are dropped instead of being
+        # concatenated.
+        df_fsf_flood = (
+            df_fsf_flood_disagg.groupby(self.GEOID_TRACT_FIELD_NAME)
+            .sum(numeric_only=True)
+            .reset_index()
+        )
+
+        # Floor the share denominator at CLIP_PROPERTIES_COUNT
+        df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[
+            self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
+        ].clip(lower=self.CLIP_PROPERTIES_COUNT)
+
+        df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = (
+            df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
+            / df_fsf_flood[self.COUNT_PROPERTIES]
+        )
+        df_fsf_flood[
+            self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
+        ] = (
+            df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
+            / df_fsf_flood[self.COUNT_PROPERTIES]
+        )
+
+        # Assign the final df to the class' output_df for the load method with rename
+        self.output_df = df_fsf_flood.rename(
+            columns={
+                self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
+                self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,
+            }
+        )
diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/README.md b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/README.md
@@ -0,0 +1,3 @@
+# FSF wildfire risk data
+
+Fire risk is computed as a burn risk probability >= 0.003.
diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# FSF flood risk data

		Flood risk computed as 1 in 100 year flood zone
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# FSF wildfire risk data

		Fire risk computed as >= 0.003 burn risk probability