Skip to content

Commit

Permalink
Add tract test (#1835)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Sep 2, 2022
1 parent 9cd5eb6 commit 98a02c4
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 7 deletions.
12 changes: 12 additions & 0 deletions data/data-pipeline/data_pipeline/tests/score/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,15 @@ def hrs_df():
dtype={GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)


@pytest.fixture()
def national_tract_df():
    """Load the national census tract CSV into a DataFrame.

    The file at ``constants.DATA_CENSUS_CSV_FILE_PATH`` is read with
    ``header=None``, so its first row is treated as data. One column name,
    ``GEOID_TRACT_FIELD_NAME``, is supplied and that column is parsed with
    the pandas ``"string"`` dtype (presumably so tract IDs keep any leading
    zeros -- confirm against the file contents).
    """
    tract_id_column = GEOID_TRACT_FIELD_NAME
    return pd.read_csv(
        constants.DATA_CENSUS_CSV_FILE_PATH,
        header=None,
        names=[tract_id_column],
        dtype={tract_id_column: "string"},
        low_memory=False,
    )
27 changes: 20 additions & 7 deletions data/data-pipeline/data_pipeline/tests/score/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
census_decennial_df,
census_2010_df,
hrs_df,
national_tract_df,
)


Expand Down Expand Up @@ -255,7 +256,7 @@ def test_data_sources(
}

for data_source_name, data_source in data_sources.items():
final = "final"
final = "final_"
df: pd.DataFrame = final_score_df.merge(
data_source,
on=GEOID_TRACT_FIELD_NAME,
Expand All @@ -264,21 +265,21 @@ def test_data_sources(
how="left",
)

# Make sure we have NAs for any tracts in the final data that aren't
# covered in the final data
assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
df = df[df.MERGE == "both"]

# Make our lists of columns for later comparison
core_cols = data_source.columns.intersection(
final_score_df.columns
).drop(GEOID_TRACT_FIELD_NAME)
data_source_columns = [f"{col}_{data_source_name}" for col in core_cols]
final_columns = [f"{col}_{final}" for col in core_cols]
final_columns = [f"{col}{final}" for col in core_cols]
assert (
final_columns
), f"No columns from data source show up in final score in source {data_source_name}"

# Make sure we have NAs for any tracts in the final data that aren't
# covered in the data source
assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
df = df[df.MERGE == "both"]

# Compare every column for equality, using close equality for numerics and
# `equals` equality for non-numeric columns
for final_column, data_source_column in zip(
Expand All @@ -302,3 +303,15 @@ def test_data_sources(
df[data_source_column],
equal_nan=True,
), error_message


def test_output_tracts(final_score_df, national_tract_df):
    """Check that the final score and the national tract list cover the same tracts.

    The two frames are outer-merged on ``GEOID_TRACT_FIELD_NAME`` with a
    ``MERGE`` indicator column. The test fails if any row is present only in
    ``final_score_df`` (``left_only``) or only in ``national_tract_df``
    (``right_only``).
    """
    df = final_score_df.merge(
        national_tract_df,
        on=GEOID_TRACT_FIELD_NAME,
        how="outer",
        indicator="MERGE",
    )

    # Filter on the indicator directly rather than indexing a value_counts()
    # result: that way the check does not depend on zero-count categories
    # being present, and the offending tract IDs can go in the message.
    only_in_score = df.loc[df["MERGE"] == "left_only", GEOID_TRACT_FIELD_NAME]
    only_in_national = df.loc[
        df["MERGE"] == "right_only", GEOID_TRACT_FIELD_NAME
    ]

    assert only_in_score.empty, (
        f"{len(only_in_score)} tracts in the final score are not in the "
        f"national tract list, e.g. {only_in_score.head().tolist()}"
    )
    assert only_in_national.empty, (
        f"{len(only_in_national)} tracts in the national tract list are "
        f"missing from the final score, e.g. {only_in_national.head().tolist()}"
    )

0 comments on commit 98a02c4

Please sign in to comment.