diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py index ed088cae2..d81ff0842 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py @@ -1,7 +1,7 @@ import pandas as pd from data_pipeline.config import settings -from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel from data_pipeline.utils import ( get_module_logger, unzip_file_from_url, @@ -11,6 +11,9 @@ class GeoCorrETL(ExtractTransformLoad): + NAME = "geocorr" + GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT + def __init__(self): self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" @@ -24,6 +27,10 @@ def __init__(self): self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" + self.COLUMNS_TO_KEEP = [ + self.GEOID_TRACT_FIELD_NAME, + self.URBAN_HEURISTIC_FIELD_NAME, + ] self.df: pd.DataFrame @@ -35,13 +42,11 @@ def extract(self) -> None: file_url=settings.AWS_JUSTICE40_DATASOURCES_URL + "/geocorr_urban_rural.csv.zip", download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "geocorr", + unzipped_file_path=self.get_tmp_path(), ) self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "geocorr" - / "geocorr_urban_rural.csv", + filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv", dtype={ self.GEOCORR_GEOID_FIELD_NAME: "string", }, @@ -50,22 +55,10 @@ def extract(self) -> None: def transform(self) -> None: logger.info("Starting GeoCorr Urban Rural Map transform") + # Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr - self.df.rename( + self.output_df = self.df.rename( columns={ "urban_heuristic_flag": self.URBAN_HEURISTIC_FIELD_NAME, }, - inplace=True, ) - - pass - - # Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr - - def load(self) -> None: - logger.info("Saving GeoCorr Urban Rural Map Data") - - # mkdir census - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) - - self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/__init__.py b/data/data-pipeline/data_pipeline/tests/sources/geocorr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/extract.csv b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/extract.csv new file mode 100644 index 000000000..bcfcf9fa6 --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/extract.csv @@ -0,0 +1,16 @@ +GEOID10_TRACT,population_in_rural_areas,population_in_urban_areas,perc_population_in_rural_areas,perc_population_in_urban_areas,urban_heuristic_flag +06027000800,3378.0000000000,,1.0000000000,,0 +06061021322,2252.0000000000,6510.0000000000,0.2570189454,0.7429810546,1 +06069000802,2007.0000000000,527.0000000000,0.7920284136,0.2079715864,0 +15001021010,7884.0000000000,,1.0000000000,,0 +15001021101,3312.0000000000,219.0000000000,0.9379779099,0.0620220901,0 +15001021402,1532.0000000000,2493.0000000000,0.3806211180,0.6193788820,1 +15001021800,2725.0000000000,3597.0000000000,0.4310344828,0.5689655172,1 +15003010201,827.0000000000,5055.0000000000,0.1405984359,0.8594015641,1 +15007040603,789.0000000000,1755.0000000000,0.3101415094,0.6898584906,1 +15007040604,275.0000000000,2864.0000000000,0.0876075183,0.9123924817,1 +15007040700,814.0000000000,7589.0000000000,0.0968701654,0.9031298346,1 +15009030100,2291.0000000000,,1.0000000000,,0 +15009030201,1982.0000000000,471.0000000000,0.8079902161,0.1920097839,0 +15009030402,419.0000000000,8233.0000000000,0.0484281091,0.9515718909,1 +15009030800,1100.0000000000,5807.0000000000,0.1592587230,0.8407412770,1 diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/geocorr_urban_rural.csv b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/geocorr_urban_rural.csv new file mode 100644 index 000000000..3dd9bf0dc --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/geocorr_urban_rural.csv @@ -0,0 +1,16 @@ +GEOID10_TRACT,population_in_rural_areas,population_in_urban_areas,perc_population_in_rural_areas,perc_population_in_urban_areas,urban_heuristic_flag +06027000800,3378.0,,1.0,,0 +06061021322,2252.0,6510.0,0.25701894544624515,0.7429810545537548,1 +06069000802,2007.0,527.0,0.7920284135753749,0.2079715864246251,0 +15001021010,7884.0,,1.0,,0 +15001021101,3312.0,219.0,0.9379779099405268,0.062022090059473234,0 +15001021402,1532.0,2493.0,0.3806211180124224,0.6193788819875776,1 +15001021800,2725.0,3597.0,0.43103448275862066,0.5689655172413793,1 +15003010201,827.0,5055.0,0.14059843590615437,0.8594015640938456,1 +15007040603,789.0,1755.0,0.31014150943396224,0.6898584905660378,1 +15007040604,275.0,2864.0,0.08760751831793565,0.9123924816820643,1 +15007040700,814.0,7589.0,0.09687016541711294,0.9031298345828871,1 +15009030100,2291.0,,1.0,,0 +15009030201,1982.0,471.0,0.807990216061965,0.19200978393803506,0 +15009030402,419.0,8233.0,0.04842810910772076,0.9515718908922792,1 +15009030800,1100.0,5807.0,0.1592587230346026,0.8407412769653975,1 diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/geocorr_urban_rural.csv.zip b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/geocorr_urban_rural.csv.zip new file mode 100644 index 000000000..9ca015021 Binary files /dev/null and b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/geocorr_urban_rural.csv.zip differ diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/output.csv new file mode 100644 index 000000000..5311c0f4c --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/output.csv @@ -0,0 +1,16 @@ +GEOID10_TRACT,Urban Heuristic Flag +06027000800,0 +06061021322,1 +06069000802,0 +15001021010,0 +15001021101,0 +15001021402,1 +15001021800,1 +15003010201,1 +15007040603,1 +15007040604,1 +15007040700,1 +15009030100,0 +15009030201,0 +15009030402,1 +15009030800,1 diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/transform.csv new file mode 100644 index 000000000..2c12f718b --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/geocorr/data/transform.csv @@ -0,0 +1,16 @@ +GEOID10_TRACT,population_in_rural_areas,population_in_urban_areas,perc_population_in_rural_areas,perc_population_in_urban_areas,Urban Heuristic Flag +06027000800,3378.0000000000,,1.0000000000,,0 +06061021322,2252.0000000000,6510.0000000000,0.2570189454,0.7429810546,1 +06069000802,2007.0000000000,527.0000000000,0.7920284136,0.2079715864,0 +15001021010,7884.0000000000,,1.0000000000,,0 +15001021101,3312.0000000000,219.0000000000,0.9379779099,0.0620220901,0 +15001021402,1532.0000000000,2493.0000000000,0.3806211180,0.6193788820,1 +15001021800,2725.0000000000,3597.0000000000,0.4310344828,0.5689655172,1 +15003010201,827.0000000000,5055.0000000000,0.1405984359,0.8594015641,1 +15007040603,789.0000000000,1755.0000000000,0.3101415094,0.6898584906,1 +15007040604,275.0000000000,2864.0000000000,0.0876075183,0.9123924817,1 +15007040700,814.0000000000,7589.0000000000,0.0968701654,0.9031298346,1 +15009030100,2291.0000000000,,1.0000000000,,0 +15009030201,1982.0000000000,471.0000000000,0.8079902161,0.1920097839,0 +15009030402,419.0000000000,8233.0000000000,0.0484281091,0.9515718909,1 +15009030800,1100.0000000000,5807.0000000000,0.1592587230,0.8407412770,1 diff --git a/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py new file mode 100644 index 000000000..bb065aaca --- /dev/null +++ b/data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py @@ -0,0 +1,19 @@ +import pathlib +from data_pipeline.tests.sources.example.test_etl import TestETL +from data_pipeline.etl.sources.geocorr.etl import GeoCorrETL + + +class TestGeoCorrETL(TestETL): + _ETL_CLASS = GeoCorrETL + + _SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data" + _SAMPLE_DATA_FILE_NAME = "geocorr_urban_rural.csv" + _SAMPLE_DATA_ZIP_FILE_NAME = "geocorr_urban_rural.csv.zip" + _EXTRACT_TMP_FOLDER_NAME = "GeoCorrETL" + + def setup_method(self, _method, filename=__file__): + """Invoke `setup_method` from Parent, but using the current file name. + + This code can be copied identically between all child classes. + """ + super().setup_method(_method=_method, filename=filename)