Skip to content

Commit

Permalink
Add tests for GeoCorr (#1848)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Sep 12, 2022
1 parent bbd78c7 commit 72feee2
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 19 deletions.
31 changes: 12 additions & 19 deletions data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import (
get_module_logger,
unzip_file_from_url,
Expand All @@ -11,6 +11,9 @@


class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT

def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"

Expand All @@ -24,6 +27,10 @@ def __init__(self):
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
self.URBAN_HEURISTIC_FIELD_NAME,
]

self.df: pd.DataFrame

Expand All @@ -35,13 +42,11 @@ def extract(self) -> None:
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
unzipped_file_path=self.get_tmp_path(),
)

self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "geocorr"
/ "geocorr_urban_rural.csv",
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv",
dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string",
},
Expand All @@ -50,22 +55,10 @@ def extract(self) -> None:

def transform(self) -> None:
logger.info("Starting GeoCorr Urban Rural Map transform")
# Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr

self.df.rename(
self.output_df = self.df.rename(
columns={
"urban_heuristic_flag": self.URBAN_HEURISTIC_FIELD_NAME,
},
inplace=True,
)

pass

# Put in logic from Jupyter Notebook transform when we switch in the hyperlink to Geocorr

def load(self) -> None:
logger.info("Saving GeoCorr Urban Rural Map Data")

# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
GEOID10_TRACT,population_in_rural_areas,population_in_urban_areas,perc_population_in_rural_areas,perc_population_in_urban_areas,urban_heuristic_flag
06027000800,3378.0000000000,,1.0000000000,,0
06061021322,2252.0000000000,6510.0000000000,0.2570189454,0.7429810546,1
06069000802,2007.0000000000,527.0000000000,0.7920284136,0.2079715864,0
15001021010,7884.0000000000,,1.0000000000,,0
15001021101,3312.0000000000,219.0000000000,0.9379779099,0.0620220901,0
15001021402,1532.0000000000,2493.0000000000,0.3806211180,0.6193788820,1
15001021800,2725.0000000000,3597.0000000000,0.4310344828,0.5689655172,1
15003010201,827.0000000000,5055.0000000000,0.1405984359,0.8594015641,1
15007040603,789.0000000000,1755.0000000000,0.3101415094,0.6898584906,1
15007040604,275.0000000000,2864.0000000000,0.0876075183,0.9123924817,1
15007040700,814.0000000000,7589.0000000000,0.0968701654,0.9031298346,1
15009030100,2291.0000000000,,1.0000000000,,0
15009030201,1982.0000000000,471.0000000000,0.8079902161,0.1920097839,0
15009030402,419.0000000000,8233.0000000000,0.0484281091,0.9515718909,1
15009030800,1100.0000000000,5807.0000000000,0.1592587230,0.8407412770,1
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
GEOID10_TRACT,population_in_rural_areas,population_in_urban_areas,perc_population_in_rural_areas,perc_population_in_urban_areas,urban_heuristic_flag
06027000800,3378.0,,1.0,,0
06061021322,2252.0,6510.0,0.25701894544624515,0.7429810545537548,1
06069000802,2007.0,527.0,0.7920284135753749,0.2079715864246251,0
15001021010,7884.0,,1.0,,0
15001021101,3312.0,219.0,0.9379779099405268,0.062022090059473234,0
15001021402,1532.0,2493.0,0.3806211180124224,0.6193788819875776,1
15001021800,2725.0,3597.0,0.43103448275862066,0.5689655172413793,1
15003010201,827.0,5055.0,0.14059843590615437,0.8594015640938456,1
15007040603,789.0,1755.0,0.31014150943396224,0.6898584905660378,1
15007040604,275.0,2864.0,0.08760751831793565,0.9123924816820643,1
15007040700,814.0,7589.0,0.09687016541711294,0.9031298345828871,1
15009030100,2291.0,,1.0,,0
15009030201,1982.0,471.0,0.807990216061965,0.19200978393803506,0
15009030402,419.0,8233.0,0.04842810910772076,0.9515718908922792,1
15009030800,1100.0,5807.0,0.1592587230346026,0.8407412769653975,1
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
GEOID10_TRACT,Urban Heuristic Flag
06027000800,0
06061021322,1
06069000802,0
15001021010,0
15001021101,0
15001021402,1
15001021800,1
15003010201,1
15007040603,1
15007040604,1
15007040700,1
15009030100,0
15009030201,0
15009030402,1
15009030800,1
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
GEOID10_TRACT,population_in_rural_areas,population_in_urban_areas,perc_population_in_rural_areas,perc_population_in_urban_areas,Urban Heuristic Flag
06027000800,3378.0000000000,,1.0000000000,,0
06061021322,2252.0000000000,6510.0000000000,0.2570189454,0.7429810546,1
06069000802,2007.0000000000,527.0000000000,0.7920284136,0.2079715864,0
15001021010,7884.0000000000,,1.0000000000,,0
15001021101,3312.0000000000,219.0000000000,0.9379779099,0.0620220901,0
15001021402,1532.0000000000,2493.0000000000,0.3806211180,0.6193788820,1
15001021800,2725.0000000000,3597.0000000000,0.4310344828,0.5689655172,1
15003010201,827.0000000000,5055.0000000000,0.1405984359,0.8594015641,1
15007040603,789.0000000000,1755.0000000000,0.3101415094,0.6898584906,1
15007040604,275.0000000000,2864.0000000000,0.0876075183,0.9123924817,1
15007040700,814.0000000000,7589.0000000000,0.0968701654,0.9031298346,1
15009030100,2291.0000000000,,1.0000000000,,0
15009030201,1982.0000000000,471.0000000000,0.8079902161,0.1920097839,0
15009030402,419.0000000000,8233.0000000000,0.0484281091,0.9515718909,1
15009030800,1100.0000000000,5807.0000000000,0.1592587230,0.8407412770,1
19 changes: 19 additions & 0 deletions data/data-pipeline/data_pipeline/tests/sources/geocorr/test_etl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pathlib
from data_pipeline.tests.sources.example.test_etl import TestETL
from data_pipeline.etl.sources.geocorr.etl import GeoCorrETL


class TestGeoCorrETL(TestETL):
    """Bind the shared `TestETL` test class to `GeoCorrETL`.

    Only the class-level configuration below and the `setup_method`
    override are defined here; everything else is inherited from `TestETL`.
    """

    # ETL class this test case is configured for.
    _ETL_CLASS = GeoCorrETL

    # Sample data is looked up in the `data` directory next to this module.
    _SAMPLE_DATA_PATH = pathlib.Path(__file__).parent / "data"
    _SAMPLE_DATA_FILE_NAME = "geocorr_urban_rural.csv"
    _SAMPLE_DATA_ZIP_FILE_NAME = "geocorr_urban_rural.csv.zip"
    # NOTE(review): presumably the temp folder the ETL extracts into, named
    # after the ETL class -- confirm against the parent `TestETL`.
    _EXTRACT_TMP_FOLDER_NAME = "GeoCorrETL"

    def setup_method(self, _method, filename=__file__):
        """Delegate to the parent `setup_method`, defaulting to this file.

        The `filename` default is evaluated in this module, so the parent
        receives this test file's path rather than its own. The override can
        be copied unchanged into every child test class.
        """
        super().setup_method(_method=_method, filename=filename)

0 comments on commit 72feee2

Please sign in to comment.