From a61ed6efe0328012d9aa3c0900003a906e33b05d Mon Sep 17 00:00:00 2001 From: matt bowen Date: Mon, 12 Sep 2022 16:55:31 -0400 Subject: [PATCH] Update tests for new multi-CSV bug (#1848) Lucas updated the CDC life expectancy data to handle a bug where two states are missing from the US Overall download. Since virtually none of our other ETL classes download multiple CSVs directly like this, it required a pretty invasive new mocking strategy. --- .../etl/sources/cdc_life_expectancy/etl.py | 11 +-- .../cdc_life_expectancy/data/transform.csv | 6 +- .../sources/cdc_life_expectancy/test_etl.py | 70 ++++++++++++++++++- .../tests/sources/cdc_places/test_etl.py | 4 -- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 0221c9df7..73a4959cb 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -16,9 +16,12 @@ class CDCLifeExpectancy(ExtractTransformLoad): GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False + NAME = "cdc_life_expectancy" + USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" - LOAD_YAML_CONFIG: bool = True + LOAD_YAML_CONFIG: bool = False LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)" + INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID" STATES_MISSING_FROM_USA_FILE = ["23", "55"] @@ -71,8 +74,7 @@ def extract(self) -> None: all_usa_raw_df = self._download_and_prep_data( file_url=self.USA_FILE_URL, download_file_name=self.get_tmp_path() - / "cdc_life_expectancy" - / "usa.csv", + / "US_A.CSV", ) # Check which states are missing @@ -93,7 +95,6 @@ def extract(self) -> None: maine_raw_df = self._download_and_prep_data( file_url=self.MAINE_FILE_URL, download_file_name=self.get_tmp_path() - / "cdc_life_expectancy" / 
"maine.csv", ) @@ -101,7 +102,6 @@ def extract(self) -> None: wisconsin_raw_df = self._download_and_prep_data( file_url=self.WISCONSIN_FILE_URL, download_file_name=self.get_tmp_path() - / "cdc_life_expectancy" / "wisconsin.csv", ) @@ -137,6 +137,7 @@ def transform(self) -> None: self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, } ) + def load(self) -> None: logger.info("Saving CDC Life Expectancy CSV") diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv index 36d2214e5..23bfc72f3 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv @@ -10,7 +10,7 @@ GEOID10_TRACT,STATE2KX,CNTY2KX,TRACT2KX,Life expectancy (years),se(e(0)),Abridge 15009030100,15,9,30100,77.2000000000,1.8736000000,3 15009030402,15,9,30402,83.5000000000,1.8267000000,3 15009030800,15,9,30800,82.2000000000,1.6251000000,3 -06027000800,6,7,40500,99.1000000000,3.1415000000,3 -06069000802,6,1,20100,99.1000000000,3.1415000000,3 -06061021322,6,7,40300,99.1000000000,3.1415000000,3 +06027000800,06,7,40500,99.1000000000,3.1415000000,3 +06069000802,06,1,20100,99.1000000000,3.1415000000,3 +06061021322,06,7,40300,99.1000000000,3.1415000000,3 15009030201,15,9,30201,99.1000000000,3.1415000000,3 diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py index 6e39217ff..cd6eedf24 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py @@ -1,6 +1,8 @@ # pylint: disable=protected-access import pathlib - +from unittest import mock +import requests +from data_pipeline.etl.base import ExtractTransformLoad from 
data_pipeline.etl.sources.cdc_life_expectancy.etl import CDCLifeExpectancy from data_pipeline.tests.sources.example.test_etl import TestETL from data_pipeline.utils import get_module_logger @@ -21,8 +23,8 @@ class TestCDCLifeExpectency(TestETL): _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data" _SAMPLE_DATA_FILE_NAME = "US_A.CSV" - _SAMPLE_DATA_ZIP_FILE_NAME = "US_A.CSV" - _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectancy" + _SAMPLE_DATA_ZIP_FILE_NAME = None + _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectanc" _EXTRACT_CSV_FILE_NAME = "extract.csv" def setup_method(self, _method, filename=__file__): @@ -32,6 +34,68 @@ def setup_method(self, _method, filename=__file__): """ super().setup_method(_method=_method, filename=filename) + def _setup_etl_instance_and_run_extract( + self, mock_etl, mock_paths + ) -> ExtractTransformLoad: + """Method to setup an ETL instance with proper upstream mocks to run extract. + This must be re-implemented in every child class. + + This method can be used by multiple tests that need to run the same fixtures + that need these same mocks. + + In order to re-implement this method, usually it will involve a + decent amount of work to monkeypatch `requests` or another method that's + used to retrieve data in order to force that method to retrieve the fixture + data. A basic version of that patching is included here for classes that can use it. + """ + + with mock.patch( + "data_pipeline.utils.requests" + ) as requests_mock, mock.patch( + "data_pipeline.etl.score.etl_utils.get_state_fips_codes" + ) as mock_get_state_fips_codes: + tmp_path = mock_paths[1] + if self._SAMPLE_DATA_ZIP_FILE_NAME is not None: + zip_file_fixture_src = ( + self._DATA_DIRECTORY_FOR_TEST + / self._SAMPLE_DATA_ZIP_FILE_NAME + ) + + # Create mock response. 
+ with open(zip_file_fixture_src, mode="rb") as file: + file_contents = file.read() + else: + with open( + self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_FILE_NAME, + "rb", + ) as file: + file_contents = file.read() + + def fake_get(url, *args, **kwargs): + response_mock = requests.Response() + response_mock.status_code = 200 + # pylint: disable=protected-access + # Return text fixture: + if url.endswith("US_A.CSV"): + response_mock._content = file_contents + else: + response_mock._content = b"Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag" + return response_mock + + requests_mock.get = fake_get + mock_get_state_fips_codes.return_value = [ + x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS + ] + # Instantiate the ETL class. + etl = self._get_instance_of_etl_class() + + # Monkey-patch the temporary directory to the one used in the test + etl.TMP_PATH = tmp_path + + # Run the extract method. + etl.extract() + return etl + def test_init(self, mock_etl, mock_paths): etl = self._ETL_CLASS() data_path, _ = mock_paths diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py index 9ddadce7a..dbffdfa5b 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py @@ -23,7 +23,3 @@ def test_sample_data_exists(self): As per conversation with Jorge, here we can *just* test that the zip file exists. """ assert (self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_FILE_NAME).exists() - - def test_extract_unzips_base(self, mock_etl, mock_paths): - # We don't have a zip, so this test doesn't make sense - pass \ No newline at end of file