Skip to content

Commit

Permalink
Add historic redlining tests (#1848)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Sep 14, 2022
1 parent f455a98 commit ecdd761
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings

logger = get_module_logger(__name__)


class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT

def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_URL = (
Expand All @@ -25,13 +28,6 @@ def __init__(self):
]
self.df: pd.DataFrame

def extract(self) -> None:
logger.info("Downloading Historic Redlining Data")
super().extract(
self.HISTORIC_REDLINING_URL,
self.get_tmp_path(),
)

def transform(self) -> None:
logger.info("Transforming Historic Redlining Data")
# this is obviously temporary
Expand All @@ -57,16 +53,4 @@ def transform(self) -> None:
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
)

self.df = historic_redlining_data

def load(self) -> None:
logger.info("Saving Historic Redlining CSV")
# write selected states csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
self.CSV_PATH / "usa.csv", index=False
)

def validate(self) -> None:
logger.info("Validating Historic Redlining Data")
pass
self.output_df = historic_redlining_data
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import geopandas as gpd
import pathlib
import geopandas as gpd
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.etl.sources.dot_travel_composite.etl import (
TravelCompositeETL,
Expand Down Expand Up @@ -31,4 +31,4 @@ def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert tmp_df.shape[0] >= 15
assert tmp_df.shape[1] >= 86
assert tmp_df.shape[1] >= 86
Empty file.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
GEOID10_TRACT,Tract-level redlining score
06027000800,3.3000000000
06061021322,3.9900000000
06069000802,3.7800000000
15001021010,4.0000000000
15001021101,4.0000000000
15001021402,3.8600000000
15001021800,4.0000000000
15003010201,3.9600000000
15007040603,3.9700000000
15007040604,3.9400000000
15007040700,3.2000000000
15009030100,3.7700000000
15009030201,3.2300000000
15009030402,3.0000000000
15009030800,3.4000000000
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
GEOID10,CBSA_NAME,CBSA_NUM,EQINTERVAL2010,Tract-level redlining score,GEOID10_TRACT,Tract-level redlining score meets or exceeds 3.25,Tract-level redlining score meets or exceeds 3.5,Tract-level redlining score meets or exceeds 3.75
6027000800,"Birmingham-Hoover, AL",13820,4,3.3000000000,06027000800,True,False,False
6061021322,"Birmingham-Hoover, AL",13820,4,3.9900000000,06061021322,True,True,True
6069000802,"Birmingham-Hoover, AL",13820,4,3.7800000000,06069000802,True,True,True
15001021010,"Birmingham-Hoover, AL",13820,4,4.0000000000,15001021010,True,True,True
15001021101,"Birmingham-Hoover, AL",13820,4,4.0000000000,15001021101,True,True,True
15001021402,"Birmingham-Hoover, AL",13820,4,3.8600000000,15001021402,True,True,True
15001021800,"Birmingham-Hoover, AL",13820,4,4.0000000000,15001021800,True,True,True
15003010201,"Birmingham-Hoover, AL",13820,4,3.9600000000,15003010201,True,True,True
15007040603,"Birmingham-Hoover, AL",13820,4,3.9700000000,15007040603,True,True,True
15007040604,"Birmingham-Hoover, AL",13820,4,3.9400000000,15007040604,True,True,True
15007040700,"Birmingham-Hoover, AL",13820,3,3.2000000000,15007040700,False,False,False
15009030100,"Birmingham-Hoover, AL",13820,4,3.7700000000,15009030100,True,True,True
15009030201,"Birmingham-Hoover, AL",13820,3,3.2300000000,15009030201,False,False,False
15009030402,"Birmingham-Hoover, AL",13820,3,3.0000000000,15009030402,False,False,False
15009030800,"Birmingham-Hoover, AL",13820,4,3.4000000000,15009030800,True,False,False
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pathlib
import pandas as pd
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.etl.sources.historic_redlining.etl import (
HistoricRedliningETL,
)


class TestHistoricRedliningETL(TestETL):
    """Exercise ``HistoricRedliningETL`` through the shared ``TestETL`` base.

    The class-level attributes below name the ETL under test and the sample
    fixture files; they are presumably consumed by the parent ``TestETL``
    helpers (verify against the base class).
    """

    _ETL_CLASS = HistoricRedliningETL

    # Sample fixtures live in the ``data`` directory next to this test module.
    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parent / "data"
    _SAMPLE_DATA_FILE_NAME = "HRS_2010.xlsx"
    _SAMPLE_DATA_ZIP_FILE_NAME = "HRS_2010.zip"
    _EXTRACT_TMP_FOLDER_NAME = "HistoricRedliningETL"

    def setup_method(self, _method, filename=__file__):
        """Delegate to the parent ``setup_method`` with this module's path.

        ``filename`` defaults to this file's ``__file__`` so the parent is
        handed the child test module rather than its own. The override is
        intentionally generic and can be copied verbatim into sibling test
        classes.
        """
        super().setup_method(_method=_method, filename=filename)

    def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
        """Check the shape of the spreadsheet left in the tmp path by extract."""
        extracted_etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl,
            mock_paths=mock_paths,
        )

        workbook_path = (
            extracted_etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME
        )
        # Read the tract GEOID column as text instead of letting pandas infer it.
        column_dtypes = {extracted_etl.GEOID_TRACT_FIELD_NAME: str}
        extracted_df = pd.read_excel(workbook_path, dtype=column_dtypes)

        expected_shape = (15, 5)
        assert extracted_df.shape == expected_shape

0 comments on commit ecdd761

Please sign in to comment.