Issue 1900: Tribal overlap with Census tracts #1903

Merged

26 commits
ea91b22 working notebook (lucasmbrown-usds, Sep 15, 2022)
457ca7a updating notebook (lucasmbrown-usds, Sep 15, 2022)
f2e8c46 wip (lucasmbrown-usds, Sep 15, 2022)
0811c6f fixing broken tests (lucasmbrown-usds, Sep 15, 2022)
5c08d35 adding tribal overlap files (lucasmbrown-usds, Sep 15, 2022)
c1a5561 WIP (lucasmbrown-usds, Sep 16, 2022)
35eb3b1 WIP (lucasmbrown-usds, Sep 16, 2022)
40b2b78 WIP, calculated count and names (lucasmbrown-usds, Sep 16, 2022)
7e8c05f working (lucasmbrown-usds, Sep 16, 2022)
73a7ae3 partial cleanup (lucasmbrown-usds, Sep 16, 2022)
31e513a partial cleanup (lucasmbrown-usds, Sep 16, 2022)
e2d747a updating field names (lucasmbrown-usds, Sep 16, 2022)
1c47b1b fixing bug (lucasmbrown-usds, Sep 16, 2022)
6abe12b removing pyogrio (lucasmbrown-usds, Sep 16, 2022)
7aea9a2 removing unused imports (lucasmbrown-usds, Sep 16, 2022)
fc15227 updating test fixtures to be more realistic (lucasmbrown-usds, Sep 16, 2022)
81e3180 cleaning up notebook (lucasmbrown-usds, Sep 16, 2022)
05216f8 fixing black (lucasmbrown-usds, Sep 16, 2022)
dbcaff7 fixing flake8 errors (lucasmbrown-usds, Sep 16, 2022)
06be30a adding tox instructions (lucasmbrown-usds, Sep 16, 2022)
a39da14 updating etl_score (lucasmbrown-usds, Sep 16, 2022)
e4abb07 suppressing warning (lucasmbrown-usds, Sep 16, 2022)
d12b55e Use projected CRSes, ignore geom types (#1900) (mattbowen-usds, Sep 19, 2022)
52446b1 Readd CDC dataset config (#1900) (mattbowen-usds, Sep 19, 2022)
4ce8f6a adding comments to fips code (lucasmbrown-usds, Sep 20, 2022)
c2149fb delete unnecessary loggers (lucasmbrown-usds, Sep 20, 2022)
9 changes: 9 additions & 0 deletions data/data-pipeline/README.md
@@ -234,6 +234,15 @@ If you want to run tile generation, please install TippeCanoe [following these i
- We use [Poetry](https://python-poetry.org/) for managing dependencies and building the application. Please follow the instructions on their site to download.
- Install Poetry requirements with `poetry install`

### Running tox

Our full test and check suite is run using tox. This can be run with commands such
as `poetry run tox`.

Each run can take a while to build the whole environment. If you'd like to save time,
you can use the previously built environment by running `poetry run tox -e lint`,
which will drastically speed up the process.

Contributor: I didn't know this one!

Contributor Author: That's a @sverchdotgov discovery!

### The Application entrypoint

After installing the poetry dependencies, you can see a list of commands with the following steps:
6 changes: 6 additions & 0 deletions data/data-pipeline/data_pipeline/etl/constants.py
@@ -186,6 +186,12 @@
"class_name": "AbandonedMineETL",
"is_memory_intensive": True,
},
{
"name": "tribal_overlap",
"module_dir": "tribal_overlap",
"class_name": "TribalOverlapETL",
"is_memory_intensive": True,
},
]

CENSUS_INFO = {
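For context, entries like the one added above are typically consumed by the ETL runner through dynamic imports. A minimal sketch of that pattern follows; the helper name `build_etl_instance` is hypothetical, not the repo's actual function:

```python
import importlib

dataset = {
    "name": "tribal_overlap",
    "module_dir": "tribal_overlap",
    "class_name": "TribalOverlapETL",
    "is_memory_intensive": True,
}

def build_etl_instance(dataset: dict):
    # Import data_pipeline.etl.sources.<module_dir>.etl and instantiate
    # the named ETL class from it.
    etl_module = importlib.import_module(
        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
    )
    etl_class = getattr(etl_module, dataset["class_name"])
    return etl_class()
```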
2 changes: 2 additions & 0 deletions data/data-pipeline/data_pipeline/etl/runner.py
@@ -106,6 +106,8 @@ def etl_runner(dataset_to_run: str = None) -> None:
# Otherwise, the exceptions are silently ignored.
fut.result()

# Note: these high-memory datasets also usually require the Census geojson to be
# generated, and one of them requires the Tribal geojson to be generated.
if high_memory_datasets:
logger.info("Running high-memory jobs")
for dataset in high_memory_datasets:
Contributor: Helpful, thank you.
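A rough sketch of the scheduling pattern the runner uses around this hunk, with simplified names (an assumption about the surrounding code, not the file's exact contents):

```python
import concurrent.futures

def run_etls(regular_etls, high_memory_etls, max_workers=4):
    # Run ordinary ETL jobs concurrently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(etl.run) for etl in regular_etls]
        for fut in concurrent.futures.as_completed(futures):
            # result() re-raises any exception from the worker;
            # otherwise failures would be silently ignored.
            fut.result()

    # Run high-memory jobs one at a time to avoid exhausting RAM.
    for etl in high_memory_etls:
        etl.run()
```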
29 changes: 27 additions & 2 deletions data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@@ -290,6 +290,32 @@ datasets:
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: true
- long_name: "Overlap between Census tract boundaries and Tribal area boundaries."
short_name: "tribal_overlap"
module_name: "tribal_overlap"
input_geoid_tract_field_name: "GEOID10_TRACT"
load_fields:
- short_name: "tribal_count"
df_field_name: "COUNT_OF_TRIBAL_AREAS_IN_TRACT"
long_name: "Number of Tribal areas within Census tract"
field_type: int64
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: false
- short_name: "tribal_percent"
df_field_name: "PERCENT_OF_TRIBAL_AREA_IN_TRACT"
long_name: "Percent of the Census tract that is within Tribal areas"
field_type: float
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: false
number_of_decimals_in_output: 6
- short_name: "tribal_names"
df_field_name: "NAMES_OF_TRIBAL_AREAS_IN_TRACT"
long_name: "Names of Tribal areas within Census tract"
field_type: string
include_in_tiles: true
include_in_downloadable_files: true
- long_name: "CDC Life Expeectancy"
short_name: "cdc_life_expectancy"
module_name: "cdc_life_expectancy"
@@ -302,5 +328,4 @@
include_in_tiles: false
include_in_downloadable_files: true
create_percentile: false
create_reverse_percentile: true

create_reverse_percentile: true
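These `load_fields` entries drive downstream behavior such as tile inclusion, percentile creation, and rounding. A minimal sketch of how such a config might be consumed, assuming a plain PyYAML loader rather than whatever schema layer the repo actually uses:

```python
import yaml

def tile_fields(datasets_yaml_path: str, module_name: str) -> list:
    # Return the df_field_name of every field flagged for tiles in one dataset.
    with open(datasets_yaml_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    for dataset in config["datasets"]:
        if dataset["module_name"] == module_name:
            return [
                field["df_field_name"]
                for field in dataset["load_fields"]
                if field.get("include_in_tiles")
            ]
    raise KeyError(f"No dataset with module_name {module_name!r}")

# e.g. tile_fields("datasets.yml", "tribal_overlap")
# -> ["COUNT_OF_TRIBAL_AREAS_IN_TRACT", "PERCENT_OF_TRIBAL_AREA_IN_TRACT",
#     "NAMES_OF_TRIBAL_AREAS_IN_TRACT"]
```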
9 changes: 9 additions & 0 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -15,6 +15,7 @@
FloodRiskETL,
)
from data_pipeline.etl.sources.eamlis.etl import AbandonedMineETL
from data_pipeline.etl.sources.tribal_overlap.etl import TribalOverlapETL
from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
from data_pipeline.etl.sources.nlcd_nature_deprived.etl import NatureDeprivedETL
from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL
@@ -52,6 +53,7 @@ def __init__(self):
self.nature_deprived_df: pd.DataFrame
self.eamlis_df: pd.DataFrame
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame

def extract(self) -> None:
logger.info("Loading data sets from disk.")
@@ -148,6 +150,9 @@ def extract(self) -> None:
# Load FUDS dataset
self.fuds_df = USArmyFUDS.get_data_frame()

# Load Tribal overlap dataset
self.tribal_overlap_df = TribalOverlapETL.get_data_frame()

# Load GeoCorr Urban Rural Map
geocorr_urban_rural_csv = (
constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@@ -359,6 +364,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
self.nature_deprived_df,
self.eamlis_df,
self.fuds_df,
self.tribal_overlap_df
]

# Sanity check each data frame before merging.
@@ -469,12 +475,15 @@ def _prepare_initial_df(self) -> pd.DataFrame:
field_names.PERCENT_AGE_UNDER_10,
field_names.PERCENT_AGE_10_TO_64,
field_names.PERCENT_AGE_OVER_64,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
]

non_numeric_columns = [
self.GEOID_TRACT_FIELD_NAME,
field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
]

boolean_columns = [
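For context, the merge these frames feed into follows a standard reduce-over-merge pattern. A simplified sketch, assuming an outer join on the tract GEOID (the real implementation's join type and sanity checks may differ):

```python
import functools
import pandas as pd

GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

def merge_census_dataframes(dfs) -> pd.DataFrame:
    # Sanity-check each frame before merging: the join key must be a
    # zero-padded 11-character string, or tracts would silently drop out.
    for df in dfs:
        assert df[GEOID_TRACT_FIELD_NAME].str.len().eq(11).all()
    return functools.reduce(
        lambda left, right: pd.merge(
            left, right, on=GEOID_TRACT_FIELD_NAME, how="outer"
        ),
        dfs,
    )
```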
22 changes: 22 additions & 0 deletions data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
@@ -229,3 +229,25 @@ def test_compare_to_list_of_expected_state_fips_codes():
continental_us_expected=False,
alaska_and_hawaii_expected=False,
)

# Missing Hawaii but not Alaska
fips_codes_test_5 = [x for x in fips_codes_test_1 if x not in ["15"]]

# Should raise error because both Hawaii and Alaska are expected
with pytest.raises(ValueError) as exception_info:
compare_to_list_of_expected_state_fips_codes(
actual_state_fips_codes=fips_codes_test_5,
alaska_and_hawaii_expected=True,
)
partial_expected_error_message = (
"FIPS state codes expected that are not present in the data:\n"
"['15']\n"
)
assert partial_expected_error_message in str(exception_info.value)

# Should work as expected
compare_to_list_of_expected_state_fips_codes(
actual_state_fips_codes=fips_codes_test_5,
alaska_and_hawaii_expected=True,
additional_fips_codes_not_expected=["15"],
)
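For readers who don't have the helper in front of them, here is a rough reconstruction of the validation it appears to perform, inferred from the test's expectations rather than copied from the repo:

```python
from typing import List, Optional

def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: List[str],
    continental_us_expected: bool = True,
    alaska_and_hawaii_expected: bool = True,
    additional_fips_codes_not_expected: Optional[List[str]] = None,
) -> None:
    # Build the expected set: the 48 continental states plus DC, plus
    # Alaska ("02") and Hawaii ("15") when expected, minus any codes the
    # caller explicitly opts out of.
    expected = set()
    if continental_us_expected:
        expected |= {f"{i:02d}" for i in range(1, 57)} - {
            "02", "03", "07", "14", "15", "43", "52"
        }
    if alaska_and_hawaii_expected:
        expected |= {"02", "15"}
    expected -= set(additional_fips_codes_not_expected or [])

    missing = sorted(expected - set(actual_state_fips_codes))
    if missing:
        raise ValueError(
            "FIPS state codes expected that are not present in the data:\n"
            f"{missing}\n"
        )
```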
57 changes: 43 additions & 14 deletions data/data-pipeline/data_pipeline/etl/sources/geo_utils.py
@@ -4,6 +4,7 @@
from typing import Optional
from functools import lru_cache
import geopandas as gpd
from data_pipeline.etl.sources.tribal.etl import TribalETL
from data_pipeline.utils import get_module_logger
from .census.etl import CensusETL

@@ -18,38 +19,66 @@ def get_tract_geojson(
GEOJSON_PATH = _tract_data_path
if GEOJSON_PATH is None:
GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH
if not GEOJSON_PATH.exists():
logger.debug("Census data has not been computed, running")
census_etl = CensusETL()
census_etl.extract()
census_etl.transform()
census_etl.load()
else:
logger.debug("Loading existing tract geojson")
tract_data = gpd.read_file(GEOJSON_PATH, include_fields=["GEOID10"])
tract_data.rename(columns={"GEOID10": "GEOID10_TRACT"}, inplace=True)
if not GEOJSON_PATH.exists():
logger.debug("Census data has not been computed, running")
census_etl = CensusETL()
census_etl.extract()
census_etl.transform()
census_etl.load()
tract_data = gpd.read_file(
GEOJSON_PATH,
include_fields=["GEOID10"],
)
tract_data = tract_data.rename(
columns={"GEOID10": "GEOID10_TRACT"}, errors="raise"
)
return tract_data


Contributor: I am curious about the usage of lru_cache here. Is this function called multiple times?

Contributor: I am guessing he was following the convention of get_tract_geojson, and I was like "well, it won't hurt, since it is expensive." I believe at this time it isn't, so I'm good to drop the cache until we need it.

Contributor Author: @esfoobar-usds @mattbowen-usds yes exactly, I was just following conventions. Do we want to delete @lru_cache from both functions, or keep it on both?

Contributor: It is actually useful on get_tract_geojson, so I vote keep it. And I'm about to merge, so it's stayin'.

@lru_cache()
def get_tribal_geojson(
_tribal_data_path: Optional[Path] = None,
) -> gpd.GeoDataFrame:
logger.info("Loading Tribal geometry data from Tribal ETL")
GEOJSON_PATH = _tribal_data_path
if GEOJSON_PATH is None:
GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH
if not GEOJSON_PATH.exists():
logger.debug("Tribal data has not been computed, running")
tribal_etl = TribalETL()
tribal_etl.extract()
tribal_etl.transform()
tribal_etl.load()
tribal_data = gpd.read_file(
GEOJSON_PATH,
)
return tribal_data
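As the thread above notes, `@lru_cache()` memoizes the zero-argument call, so repeated invocations reuse the already-loaded frame. A toy demonstration of the mechanism, independent of the geodata:

```python
from functools import lru_cache

@lru_cache()
def load_expensive_resource() -> list:
    print("loading...")  # prints only on the first call
    return [1, 2, 3]

a = load_expensive_resource()  # prints "loading..."
b = load_expensive_resource()  # cache hit: no print, no reload
assert a is b  # the exact same object is returned
```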


Contributor Author: I refactored this a bit, so I could load the Tract geojson once in the tribal overlap ETL, and not just pass around a data path.

Contributor: I like that change --- I had been passing the path just to make my tests faster, so this is a nice way of adding flexibility.

Contributor: haha, I did not expect this to end up used this way, but now I'm glad it existed and could be easily adapted to this new use case.

Contributor Author: it was awesome! loved reusable code!

def add_tracts_for_geometries(
df: gpd.GeoDataFrame, _tract_data_path: Optional[Path] = None
df: gpd.GeoDataFrame, tract_data: Optional[gpd.GeoDataFrame] = None
) -> gpd.GeoDataFrame:
"""Adds tract-geoids to dataframe df that contains spatial geometries

Depends on CensusETL for the geodata to do its conversion

Args:
df (GeoDataFrame): a geopandas GeoDataFrame with a point geometry column
_tract_data_path (Path): an override to directly pass a GEOJSON file of
tracts->Geometries, to simplify testing.
tract_data (GeoDataFrame): optional override to directly pass a
geodataframe of the tract boundaries. Also helps simplify testing.

Returns:
GeoDataFrame: the above dataframe, with an additional GEOID10_TRACT column that
maps the points in DF to census tracts and a geometry column for later
spatial analysis
"""
logger.debug("Appending tract data to dataframe")
tract_data = get_tract_geojson(_tract_data_path)

if tract_data is None:
tract_data = get_tract_geojson()
else:
logger.debug("Using existing tract data.")

assert (
tract_data.crs == df.crs
), f"Dataframe must be projected to {tract_data.crs}"
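Under the hood, mapping geometries to tracts is a spatial join. A compact sketch of how add_tracts_for_geometries plausibly finishes the job (assumed details; the real function may differ):

```python
import geopandas as gpd

def sketch_add_tracts(
    df: gpd.GeoDataFrame, tract_data: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    # Both frames must share a CRS, or the join is geometrically meaningless.
    assert tract_data.crs == df.crs, f"Dataframe must be projected to {tract_data.crs}"
    # "within" keeps rows whose geometry falls inside a tract polygon and
    # carries the tract's GEOID10_TRACT column onto each matching row.
    return gpd.sjoin(
        df,
        tract_data[["GEOID10_TRACT", "geometry"]],
        how="inner",
        predicate="within",
    )
```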
20 changes: 15 additions & 5 deletions data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py
@@ -3,6 +3,7 @@
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger, unzip_file_from_url

logger = get_module_logger(__name__)
@@ -59,7 +60,10 @@ def _transform_bia_national_lar(self, tribal_geojson_path: Path) -> None:
)

bia_national_lar_df.rename(
columns={"LARID": "tribalId", "LARName": "landAreaName"},
columns={
"LARID": field_names.TRIBAL_ID,
"LARName": field_names.TRIBAL_LAND_AREA_NAME,
},
inplace=True,
)

Contributor, on lines +64 to +65: Nice catch!

@@ -87,7 +91,10 @@ def _transform_bia_aian_supplemental(
)

bia_aian_supplemental_df.rename(
columns={"OBJECTID": "tribalId", "Land_Area_": "landAreaName"},
columns={
"OBJECTID": field_names.TRIBAL_ID,
"Land_Area_": field_names.TRIBAL_LAND_AREA_NAME,
},
inplace=True,
)

Contributor, on lines +95 to +96: Ditto!

@@ -113,7 +120,10 @@ def _transform_bia_tsa(self, tribal_geojson_path: Path) -> None:
)

bia_tsa_df.rename(
columns={"TSAID": "tribalId", "LARName": "landAreaName"},
columns={
"TSAID": field_names.TRIBAL_ID,
"LARName": field_names.TRIBAL_LAND_AREA_NAME,
},
inplace=True,
)

@@ -136,8 +146,8 @@ def _transform_alaska_native_villages(

alaska_native_villages_df.rename(
columns={
"GlobalID": "tribalId",
"TRIBALOFFICENAME": "landAreaName",
"GlobalID": field_names.TRIBAL_ID,
"TRIBALOFFICENAME": field_names.TRIBAL_LAND_AREA_NAME,
},
inplace=True,
)
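The switch from string literals to field_names constants centralizes column naming across the tribal ETLs. A tiny illustration of the pattern; the constant values shown are taken from the old literals above, and the repo's actual field_names module may define them differently:

```python
import pandas as pd

# Illustrative excerpt of data_pipeline/score/field_names.py (assumed values).
TRIBAL_ID = "tribalId"
TRIBAL_LAND_AREA_NAME = "landAreaName"

raw = pd.DataFrame({"LARID": ["01"], "LARName": ["Example Land Area"]})

# Every tribal ETL renames against the shared constants, so downstream code
# can reference one canonical column name per concept instead of scattered
# string literals.
renamed = raw.rename(
    columns={"LARID": TRIBAL_ID, "LARName": TRIBAL_LAND_AREA_NAME}
)
```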