From a61ed6efe0328012d9aa3c0900003a906e33b05d Mon Sep 17 00:00:00 2001 From: matt bowen Date: Mon, 12 Sep 2022 16:55:31 -0400 Subject: [PATCH] Update tests for new multi-CSV bug (#1848) Lucas updated the CDC life expectancy data to handle a bug where two states are missing from the US Overall download. Since virtually none of our other ETL classes download multiple CSVs directly like this, it required a pretty invasive new mocking strategy. --- .../etl/sources/cdc_life_expectancy/etl.py | 11 +-- .../cdc_life_expectancy/data/transform.csv | 6 +- .../sources/cdc_life_expectancy/test_etl.py | 70 ++++++++++++++++++- .../tests/sources/cdc_places/test_etl.py | 4 -- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 0221c9df7..73a4959cb 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -16,9 +16,12 @@ class CDCLifeExpectancy(ExtractTransformLoad): GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False + NAME = "cdc_life_expectancy" + USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" - LOAD_YAML_CONFIG: bool = True + LOAD_YAML_CONFIG: bool = False LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)" + INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID" STATES_MISSING_FROM_USA_FILE = ["23", "55"] @@ -71,8 +74,7 @@ def extract(self) -> None: all_usa_raw_df = self._download_and_prep_data( file_url=self.USA_FILE_URL, download_file_name=self.get_tmp_path() - / "cdc_life_expectancy" - / "usa.csv", + / "US_A.CSV", ) # Check which states are missing @@ -93,7 +95,6 @@ def extract(self) -> None: maine_raw_df = self._download_and_prep_data( file_url=self.MAINE_FILE_URL, download_file_name=self.get_tmp_path() - / "cdc_life_expectancy" / 
"maine.csv", ) @@ -101,7 +102,6 @@ def extract(self) -> None: wisconsin_raw_df = self._download_and_prep_data( file_url=self.WISCONSIN_FILE_URL, download_file_name=self.get_tmp_path() - / "cdc_life_expectancy" / "wisconsin.csv", ) @@ -137,6 +137,7 @@ def transform(self) -> None: self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, } ) + def load(self) -> None: logger.info("Saving CDC Life Expectancy CSV") diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv index 36d2214e5..23bfc72f3 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/data/transform.csv @@ -10,7 +10,7 @@ GEOID10_TRACT,STATE2KX,CNTY2KX,TRACT2KX,Life expectancy (years),se(e(0)),Abridge 15009030100,15,9,30100,77.2000000000,1.8736000000,3 15009030402,15,9,30402,83.5000000000,1.8267000000,3 15009030800,15,9,30800,82.2000000000,1.6251000000,3 -06027000800,6,7,40500,99.1000000000,3.1415000000,3 -06069000802,6,1,20100,99.1000000000,3.1415000000,3 -06061021322,6,7,40300,99.1000000000,3.1415000000,3 +06027000800,06,7,40500,99.1000000000,3.1415000000,3 +06069000802,06,1,20100,99.1000000000,3.1415000000,3 +06061021322,06,7,40300,99.1000000000,3.1415000000,3 15009030201,15,9,30201,99.1000000000,3.1415000000,3 diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py index 6e39217ff..cd6eedf24 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py @@ -1,6 +1,8 @@ # pylint: disable=protected-access import pathlib - +from unittest import mock +import requests +from data_pipeline.etl.base import ExtractTransformLoad from 
data_pipeline.etl.sources.cdc_life_expectancy.etl import CDCLifeExpectancy from data_pipeline.tests.sources.example.test_etl import TestETL from data_pipeline.utils import get_module_logger @@ -21,8 +23,8 @@ class TestCDCLifeExpectency(TestETL): _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data" _SAMPLE_DATA_FILE_NAME = "US_A.CSV" - _SAMPLE_DATA_ZIP_FILE_NAME = "US_A.CSV" - _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectancy" + _SAMPLE_DATA_ZIP_FILE_NAME = None + _EXTRACT_TMP_FOLDER_NAME = "CDCLifeExpectanc" _EXTRACT_CSV_FILE_NAME = "extract.csv" def setup_method(self, _method, filename=__file__): @@ -32,6 +34,68 @@ def setup_method(self, _method, filename=__file__): """ super().setup_method(_method=_method, filename=filename) + def _setup_etl_instance_and_run_extract( + self, mock_etl, mock_paths + ) -> ExtractTransformLoad: + """Method to setup an ETL instance with proper upstream mocks to run extract. + This must be re-implemented in every child class. + + This method can be used by multiple tests that need to run the same fixtures + that need these same mocks. + + In order to re-implement this method, usually it will involve a + decent amount of work to monkeypatch `requests` or another method that's + used to retrieve data in order to force that method to retrieve the fixture + data. A basic version of that patching is included here for classes that can use it. + """ + + with mock.patch( + "data_pipeline.utils.requests" + ) as requests_mock, mock.patch( + "data_pipeline.etl.score.etl_utils.get_state_fips_codes" + ) as mock_get_state_fips_codes: + tmp_path = mock_paths[1] + if self._SAMPLE_DATA_ZIP_FILE_NAME is not None: + zip_file_fixture_src = ( + self._DATA_DIRECTORY_FOR_TEST + / self._SAMPLE_DATA_ZIP_FILE_NAME + ) + + # Create mock response. 
+ with open(zip_file_fixture_src, mode="rb") as file: + file_contents = file.read() + else: + with open( + self._DATA_DIRECTORY_FOR_TEST / self._SAMPLE_DATA_FILE_NAME, + "rb", + ) as file: + file_contents = file.read() + + def fake_get(url, *args, **kwargs): + response_mock = requests.Response() + response_mock.status_code = 200 + # pylint: disable=protected-access + # Return text fixture: + if url.endswith("US_A.CSV"): + response_mock._content = file_contents + else: + response_mock._content = b"Tract ID,STATE2KX,CNTY2KX,TRACT2KX,e(0),se(e(0)),Abridged life table flag" + return response_mock + + requests_mock.get = fake_get + mock_get_state_fips_codes.return_value = [ + x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS + ] + # Instantiate the ETL class. + etl = self._get_instance_of_etl_class() + + # Monkey-patch the temporary directory to the one used in the test + etl.TMP_PATH = tmp_path + + # Run the extract method. + etl.extract() + return etl + def test_init(self, mock_etl, mock_paths): etl = self._ETL_CLASS() data_path, _ = mock_paths diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py index 9ddadce7a..dbffdfa5b 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_places/test_etl.py @@ -23,7 +23,3 @@ def test_sample_data_exists(self): As per conversation with Jorge, here we can *just* test that the zip file exists. """ assert (self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_FILE_NAME).exists() - - def test_extract_unzips_base(self, mock_etl, mock_paths): - # We don't have a zip, so this test doesn't make sense - pass \ No newline at end of file