Skip to content

Commit

Permalink
Update tests for new multi-CSV bug (#1848)
Browse files Browse the repository at this point in the history
Lucas updated the CDC life expectancy data to handle a bug where two
states are missing from the US Overall download. Since virtually none of
our other ETL classes download multiple CSVs directly like this, it
required a pretty invasive new mocking strategy.
  • Loading branch information
mattbowen-usds committed Sep 12, 2022
1 parent eec4db2 commit a61ed6e
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ class CDCLifeExpectancy(ExtractTransformLoad):
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False

NAME = "cdc_life_expectancy"

USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
LOAD_YAML_CONFIG: bool = True
LOAD_YAML_CONFIG: bool = False
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"

STATES_MISSING_FROM_USA_FILE = ["23", "55"]

Expand Down Expand Up @@ -71,8 +74,7 @@ def extract(self) -> None:
all_usa_raw_df = self._download_and_prep_data(
file_url=self.USA_FILE_URL,
download_file_name=self.get_tmp_path()
/ "cdc_life_expectancy"
/ "usa.csv",
/ "US_A.CSV",
)

# Check which states are missing
Expand All @@ -93,15 +95,13 @@ def extract(self) -> None:
maine_raw_df = self._download_and_prep_data(
file_url=self.MAINE_FILE_URL,
download_file_name=self.get_tmp_path()
/ "cdc_life_expectancy"
/ "maine.csv",
)

logger.info("Downloading data for Wisconsin")
wisconsin_raw_df = self._download_and_prep_data(
file_url=self.WISCONSIN_FILE_URL,
download_file_name=self.get_tmp_path()
/ "cdc_life_expectancy"
/ "wisconsin.csv",
)

Expand Down Expand Up @@ -137,6 +137,7 @@ def transform(self) -> None:
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
}
)

def load(self) -> None:
logger.info("Saving CDC Life Expectancy CSV")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ GEOID10_TRACT,STATE2KX,CNTY2KX,TRACT2KX,Life expectancy (years),se(e(0)),Abridge
15009030100,15,9,30100,77.2000000000,1.8736000000,3
15009030402,15,9,30402,83.5000000000,1.8267000000,3
15009030800,15,9,30800,82.2000000000,1.6251000000,3
06027000800,6,7,40500,99.1000000000,3.1415000000,3
06069000802,6,1,20100,99.1000000000,3.1415000000,3
06061021322,6,7,40300,99.1000000000,3.1415000000,3
06027000800,06,7,40500,99.1000000000,3.1415000000,3
06069000802,06,1,20100,99.1000000000,3.1415000000,3
06061021322,06,7,40300,99.1000000000,3.1415000000,3
15009030201,15,9,30201,99.1000000000,3.1415000000,3
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# pylint: disable=protected-access
import pathlib

from unittest import mock
import requests
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.cdc_life_expectancy.etl import CDCLifeExpectancy
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.utils import get_module_logger
Expand All @@ -21,8 +23,8 @@ class TestCDCLifeExpectency(TestETL):

_SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
_SAMPLE_DATA_FILE_NAME = "US_A.CSV"
_SAMPLE_DATA_ZIP_FILE_NAME = "US_A.CSV"
_EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectancy"
_SAMPLE_DATA_ZIP_FILE_NAME = None
_EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectanc"
_EXTRACT_CSV_FILE_NAME = "extract.csv"

def setup_method(self, _method, filename=__file__):
Expand All @@ -32,6 +34,68 @@ def setup_method(self, _method, filename=__file__):
"""
super().setup_method(_method=_method, filename=filename)

def _setup_etl_instance_and_run_extract(
    self, mock_etl, mock_paths
) -> ExtractTransformLoad:
    """Create the ETL under test and run ``extract`` against local fixtures.

    While this method runs, ``data_pipeline.utils.requests`` and
    ``data_pipeline.etl.score.etl_utils.get_state_fips_codes`` are patched:

    * ``requests.get`` is replaced by a stub that always answers with
      status 200. A URL ending in ``US_A.CSV`` receives the sample fixture
      bytes; any other URL receives a header-only CSV payload.
    * ``get_state_fips_codes`` returns the first two characters of every
      entry in ``_FIXTURES_SHARED_TRACT_IDS``.

    Args:
        mock_etl: Test fixture; not referenced in this method body.
        mock_paths: Indexable fixture; element 1 is assigned to the ETL's
            ``TMP_PATH``.

    Returns:
        The ETL instance after ``extract()`` has been called on it.
    """
    with mock.patch(
        "data_pipeline.utils.requests"
    ) as patched_requests, mock.patch(
        "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
    ) as patched_fips_codes:
        scratch_dir = mock_paths[1]

        # Prefer the zip fixture when one is configured, otherwise fall
        # back to the plain sample data file.
        fixture_name = (
            self._SAMPLE_DATA_ZIP_FILE_NAME
            if self._SAMPLE_DATA_ZIP_FILE_NAME is not None
            else self._SAMPLE_DATA_FILE_NAME
        )
        with open(
            self._DATA_DIRECTORY_FOR_TEST / fixture_name, mode="rb"
        ) as fixture:
            fixture_bytes = fixture.read()

        # Served for every download other than the US-wide file.
        header_only_csv = b"Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag"

        def fake_get(url, *args, **kwargs):
            """Stand-in for ``requests.get`` that serves canned content."""
            canned = requests.Response()
            canned.status_code = 200
            # pylint: disable=protected-access
            canned._content = (
                fixture_bytes if url.endswith("US_A.CSV") else header_only_csv
            )
            return canned

        patched_requests.get = fake_get
        patched_fips_codes.return_value = [
            tract_id[:2] for tract_id in self._FIXTURES_SHARED_TRACT_IDS
        ]

        # Build the ETL, point its temp directory at the test's scratch
        # directory, then run extract while the patches are still active.
        etl = self._get_instance_of_etl_class()
        etl.TMP_PATH = scratch_dir
        etl.extract()

    return etl

def test_init(self, mock_etl, mock_paths):
etl = self._ETL_CLASS()
data_path, _ = mock_paths
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,3 @@ def test_sample_data_exists(self):
As per conversation with Jorge, here we can *just* test that the zip file exists.
"""
assert (self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_FILE_NAME).exists()

def test_extract_unzips_base(self, mock_etl, mock_paths):
    # We don't have a zip, so this test doesn't make sense.
    # The body is an intentional no-op: with no zip fixture there is
    # nothing for an unzip check to exercise.
    # NOTE(review): presumably this overrides a base-class test of the same
    # name -- confirm against the parent test class.
    pass

0 comments on commit a61ed6e

Please sign in to comment.