-
Notifications
You must be signed in to change notification settings - Fork 55
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Issue 1900: Tribal overlap with Census tracts #1903
Changes from all commits
ea91b22
457ca7a
f2e8c46
0811c6f
5c08d35
c1a5561
35eb3b1
40b2b78
7e8c05f
73a7ae3
31e513a
e2d747a
1c47b1b
6abe12b
7aea9a2
fc15227
81e3180
05216f8
dbcaff7
06be30a
a39da14
e4abb07
d12b55e
52446b1
4ce8f6a
c2149fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,6 +106,8 @@ def etl_runner(dataset_to_run: str = None) -> None: | |
# Otherwise, the exceptions are silently ignored. | ||
fut.result() | ||
|
||
# Note: these high-memory datasets also usually require the Census geojson to be | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Helpful, thank you. |
||
# generated, and one of them requires the Tribal geojson to be generated. | ||
if high_memory_datasets: | ||
logger.info("Running high-memory jobs") | ||
for dataset in high_memory_datasets: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
from typing import Optional | ||
from functools import lru_cache | ||
import geopandas as gpd | ||
from data_pipeline.etl.sources.tribal.etl import TribalETL | ||
from data_pipeline.utils import get_module_logger | ||
from .census.etl import CensusETL | ||
|
||
|
@@ -18,38 +19,66 @@ def get_tract_geojson( | |
GEOJSON_PATH = _tract_data_path | ||
if GEOJSON_PATH is None: | ||
GEOJSON_PATH = CensusETL.NATIONAL_TRACT_JSON_PATH | ||
if not GEOJSON_PATH.exists(): | ||
logger.debug("Census data has not been computed, running") | ||
census_etl = CensusETL() | ||
census_etl.extract() | ||
census_etl.transform() | ||
census_etl.load() | ||
else: | ||
logger.debug("Loading existing tract geojson") | ||
tract_data = gpd.read_file(GEOJSON_PATH, include_fields=["GEOID10"]) | ||
tract_data.rename(columns={"GEOID10": "GEOID10_TRACT"}, inplace=True) | ||
if not GEOJSON_PATH.exists(): | ||
logger.debug("Census data has not been computed, running") | ||
census_etl = CensusETL() | ||
census_etl.extract() | ||
census_etl.transform() | ||
census_etl.load() | ||
tract_data = gpd.read_file( | ||
GEOJSON_PATH, | ||
include_fields=["GEOID10"], | ||
) | ||
tract_data = tract_data.rename( | ||
columns={"GEOID10": "GEOID10_TRACT"}, errors="raise" | ||
) | ||
return tract_data | ||
|
||
|
||
@lru_cache() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am curious about the usage of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am guessing he was following the convention There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @esfoobar-usds @mattbowen-usds yes exactly I was just following conventions. Do we want to delete @lru_cache from both functions, or keep it on both? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is actually useful on get_tract_geojson, so I vote keep it. And I'm about to merge so it's stayin' |
||
def get_tribal_geojson( | ||
_tribal_data_path: Optional[Path] = None, | ||
) -> gpd.GeoDataFrame: | ||
logger.info("Loading Tribal geometry data from Tribal ETL") | ||
GEOJSON_PATH = _tribal_data_path | ||
if GEOJSON_PATH is None: | ||
GEOJSON_PATH = TribalETL().NATIONAL_TRIBAL_GEOJSON_PATH | ||
if not GEOJSON_PATH.exists(): | ||
logger.debug("Tribal data has not been computed, running") | ||
tribal_etl = TribalETL() | ||
tribal_etl.extract() | ||
tribal_etl.transform() | ||
tribal_etl.load() | ||
tribal_data = gpd.read_file( | ||
GEOJSON_PATH, | ||
) | ||
return tribal_data | ||
|
||
|
||
def add_tracts_for_geometries( | ||
df: gpd.GeoDataFrame, _tract_data_path: Optional[Path] = None | ||
df: gpd.GeoDataFrame, tract_data: Optional[gpd.GeoDataFrame] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I refactored this a bit, so I could load the Tract geojson once in the tribal overlap ETL, and not just pass around a data path. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like that change --- I had been passing the path just to make my tests faster, so this is a nice way of adding flexibility. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. haha I did not expect this to end up used this way, but now I'm glad it existed and could be easily adapted to this new use case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it was awesome! loved reusable code! |
||
) -> gpd.GeoDataFrame: | ||
"""Adds tract-geoids to dataframe df that contains spatial geometries | ||
|
||
Depends on CensusETL for the geodata to do its conversion | ||
|
||
Args: | ||
df (GeoDataFrame): a geopandas GeoDataFrame with a point geometry column | ||
_tract_data_path (Path): an override to directly pass a GEOJSON file of | ||
tracts->Geometries, to simplify testing. | ||
tract_data (GeoDataFrame): optional override to directly pass a | ||
geodataframe of the tract boundaries. Also helps simplify testing. | ||
|
||
Returns: | ||
GeoDataFrame: the above dataframe, with an additional GEOID10_TRACT column that | ||
maps the points in DF to census tracts and a geometry column for later | ||
spatial analysis | ||
""" | ||
logger.debug("Appending tract data to dataframe") | ||
tract_data = get_tract_geojson(_tract_data_path) | ||
|
||
if tract_data is None: | ||
tract_data = get_tract_geojson() | ||
else: | ||
logger.debug("Using existing tract data.") | ||
|
||
assert ( | ||
tract_data.crs == df.crs | ||
), f"Dataframe must be projected to {tract_data.crs}" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
import pandas as pd | ||
|
||
from data_pipeline.etl.base import ExtractTransformLoad | ||
from data_pipeline.score import field_names | ||
from data_pipeline.utils import get_module_logger, unzip_file_from_url | ||
|
||
logger = get_module_logger(__name__) | ||
|
@@ -59,7 +60,10 @@ def _transform_bia_national_lar(self, tribal_geojson_path: Path) -> None: | |
) | ||
|
||
bia_national_lar_df.rename( | ||
columns={"LARID": "tribalId", "LARName": "landAreaName"}, | ||
columns={ | ||
"LARID": field_names.TRIBAL_ID, | ||
"LARName": field_names.TRIBAL_LAND_AREA_NAME, | ||
Comment on lines
+64
to
+65
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice catch! |
||
}, | ||
inplace=True, | ||
) | ||
|
||
|
@@ -87,7 +91,10 @@ def _transform_bia_aian_supplemental( | |
) | ||
|
||
bia_aian_supplemental_df.rename( | ||
columns={"OBJECTID": "tribalId", "Land_Area_": "landAreaName"}, | ||
columns={ | ||
"OBJECTID": field_names.TRIBAL_ID, | ||
"Land_Area_": field_names.TRIBAL_LAND_AREA_NAME, | ||
Comment on lines
+95
to
+96
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto! |
||
}, | ||
inplace=True, | ||
) | ||
|
||
|
@@ -113,7 +120,10 @@ def _transform_bia_tsa(self, tribal_geojson_path: Path) -> None: | |
) | ||
|
||
bia_tsa_df.rename( | ||
columns={"TSAID": "tribalId", "LARName": "landAreaName"}, | ||
columns={ | ||
"TSAID": field_names.TRIBAL_ID, | ||
"LARName": field_names.TRIBAL_LAND_AREA_NAME, | ||
}, | ||
inplace=True, | ||
) | ||
|
||
|
@@ -136,8 +146,8 @@ def _transform_alaska_native_villages( | |
|
||
alaska_native_villages_df.rename( | ||
columns={ | ||
"GlobalID": "tribalId", | ||
"TRIBALOFFICENAME": "landAreaName", | ||
"GlobalID": field_names.TRIBAL_ID, | ||
"TRIBALOFFICENAME": field_names.TRIBAL_LAND_AREA_NAME, | ||
}, | ||
inplace=True, | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't know this one!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's a @sverchdotgov discovery!