Skip to content

Commit

Permalink
Test a few datasets for overlap in the final score (#1835)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Sep 1, 2022
1 parent b26d359 commit ec79f87
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 1 deletion.
87 changes: 87 additions & 0 deletions data/data-pipeline/data_pipeline/tests/score/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import pytest
from data_pipeline.config import settings
from data_pipeline.score import field_names
from data_pipeline.etl.score import constants

GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD


@pytest.fixture(scope="session")
Expand All @@ -11,3 +14,87 @@ def final_score_df():
dtype={field_names.GEOID_TRACT_FIELD: str},
low_memory=False,
)


@pytest.fixture(scope="session")
def census_df():
    """Load the Census ACS 2019 dataset (one row per tract), shared across the session."""
    path = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


@pytest.fixture(scope="session")
def ejscreen_df():
    """Load the EJSCREEN dataset (one row per tract), shared across the session."""
    path = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


@pytest.fixture(scope="session")
def hud_housing_df():
    """Load the HUD housing dataset (one row per tract), shared across the session."""
    path = constants.DATA_PATH / "dataset" / "hud_housing" / "usa.csv"
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


@pytest.fixture(scope="session")
def cdc_places_df():
    """Load the CDC PLACES dataset (one row per tract), shared across the session."""
    path = constants.DATA_PATH / "dataset" / "cdc_places" / "usa.csv"
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


@pytest.fixture(scope="session")
def census_acs_median_incomes_df():
    """Load the ACS 2019 median-income dataset (one row per tract), shared across the session."""
    path = (
        constants.DATA_PATH
        / "dataset"
        / "census_acs_median_income_2019"
        / "usa.csv"
    )
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


@pytest.fixture(scope="session")
def cdc_life_expectancy_df():
    """Load the CDC life-expectancy dataset (one row per tract), shared across the session."""
    path = constants.DATA_PATH / "dataset" / "cdc_life_expectancy" / "usa.csv"
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


@pytest.fixture(scope="session")
def doe_energy_burden_df():
    """Load the DOE energy-burden dataset (one row per tract), shared across the session."""
    path = constants.DATA_PATH / "dataset" / "doe_energy_burden" / "usa.csv"
    # Read the GEOID as a pandas "string" dtype so leading zeros survive.
    return pd.read_csv(path, dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False)


# TODO: The datasets that are loaded from data_pipeline/etl/score/etl_score.py:131
52 changes: 51 additions & 1 deletion data/data-pipeline/data_pipeline/tests/score/test_output.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
# flake8: noqa: W0613,W0611,F811
# pylint: disable=unused-import
import inspect
from dataclasses import dataclass
from typing import List
import pytest
import pandas as pd
import numpy as np
from data_pipeline.score import field_names
from .fixtures import final_score_df # pylint: disable=unused-import
from .fixtures import (
final_score_df,
ejscreen_df,
hud_housing_df,
cdc_places_df,
)

pytestmark = pytest.mark.smoketest
GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD


def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
Expand Down Expand Up @@ -203,3 +212,44 @@ def test_donut_hole_addition_to_score_n(final_score_df):
assert (
new_donuts > 0
), "FYI: The adjacency index is doing nothing. Consider removing it?"


def test_data_sources(
    final_score_df, hud_housing_df, ejscreen_df, cdc_places_df
):
    """Verify that each source dataset's values carry into the final score unchanged.

    For every source fixture, left-merge it onto the final score on the tract
    GEOID and assert that:
      * at least one source column is present in the final score,
      * tracts absent from the source are NaN in the final score's copy, and
      * for tracts present in both, the values match (NaN compares equal).
    """
    # Capture every fixture argument except the final score itself.
    # NOTE: relies on this being the first statement so that locals()
    # contains only the function parameters.
    data_sources = {
        key: value for key, value in locals().items() if key != "final_score_df"
    }

    for data_source_name, data_source in data_sources.items():
        final = "_final"
        df: pd.DataFrame = final_score_df.merge(
            data_source,
            on=GEOID_TRACT_FIELD_NAME,
            # indicator adds a "MERGE" column: left_only / right_only / both.
            indicator="MERGE",
            suffixes=(final, f"_{data_source_name}"),
            how="left",
        )
        # Columns shared by both frames get suffixed by the merge; build the
        # suffixed names for the source side and the final-score side.
        data_source_columns = [
            f"{col}_{data_source_name}"
            for col in data_source.columns
            if (col != GEOID_TRACT_FIELD_NAME and col in final_score_df.columns)
        ]
        final_columns = [
            f"{col}{final}"
            for col in data_source.columns
            if (col != GEOID_TRACT_FIELD_NAME and col in final_score_df.columns)
        ]
        # Guard first: with no shared columns the checks below pass vacuously.
        assert (
            final_columns
        ), "No columns from data source show up in final score"
        # Tracts missing from the source must be NaN in the final score's copy.
        assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
        df = df[df.MERGE == "both"]
        # BUG FIX: the original zipped data_source_columns into a variable
        # named final_column (and vice versa), so failure messages reported
        # the source-suffixed column instead of the final-score column.
        for final_column, data_source_column in zip(
            final_columns, data_source_columns
        ):
            assert np.allclose(
                df[final_column],
                df[data_source_column],
                equal_nan=True,
            ), f"Column {final_column} not equal between {data_source_name} and final score"

0 comments on commit ec79f87

Please sign in to comment.