Skip to content

Commit

Permalink
Check for unmatched source tracts (#1835)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Sep 2, 2022
1 parent 0d919fd commit 358a1fd
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion data/data-pipeline/data_pipeline/tests/score/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

pytestmark = pytest.mark.smoketest
GEOID_TRACT_FIELD_NAME = field_names.GEOID_TRACT_FIELD
UNMATCHED_TRACK_THRESHOLD = 1000


def _helper_test_count_exceeding_threshold(df, col, error_check=1000):
Expand Down Expand Up @@ -262,7 +263,7 @@ def test_data_sources(
on=GEOID_TRACT_FIELD_NAME,
indicator="MERGE",
suffixes=(final, f"_{data_source_name}"),
how="left",
how="outer",
)

# Make our lists of columns for later comparison
Expand All @@ -278,6 +279,11 @@ def test_data_sources(
# Make sure we have NAs for any tracts in the final data that aren't
# covered in the data source
assert np.all(df[df.MERGE == "left_only"][final_columns].isna())

# Make sure the data source doesn't have a large number of unmatched tracts,
# which would imply it has moved to 2020 tracts
assert len(df[df.MERGE == "right_only"]) < UNMATCHED_TRACK_THRESHOLD

df = df[df.MERGE == "both"]

# Compare every column for equality, using close equality for numerics and
Expand Down

0 comments on commit 358a1fd

Please sign in to comment.