diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index da02beef7..31eacbe1d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -60,6 +60,7 @@ def __init__(self, data_source: str = None): field_names.GEOID_TRACT_FIELD ] self.GEOMETRY_FIELD_NAME = "geometry" + self.LAND_FIELD_NAME = "ALAND10" # We will adjust this upwards while there is some fractional value # in the score. This is a starting value. @@ -86,13 +87,22 @@ def extract(self) -> None: ) logger.info("Reading US GeoJSON (~6 minutes)") - self.geojson_usa_df = gpd.read_file( + full_geojson_usa_df = gpd.read_file( self.CENSUS_USA_GEOJSON, dtype={self.GEOID_FIELD_NAME: "string"}, - usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME], + usecols=[ + self.GEOID_FIELD_NAME, + self.GEOMETRY_FIELD_NAME, + self.LAND_FIELD_NAME, + ], low_memory=False, ) + # We only want to keep tracts to visualize that have non-0 land + self.geojson_usa_df = full_geojson_usa_df[ + full_geojson_usa_df[self.LAND_FIELD_NAME] > 0 + ] + logger.info("Reading score CSV") self.score_usa_df = pd.read_csv( self.TILE_SCORE_CSV,