From 98a02c496ea42974e127dc9243986fd1fd1f9f4c Mon Sep 17 00:00:00 2001 From: matt bowen Date: Fri, 2 Sep 2022 11:44:47 -0400 Subject: [PATCH] Add tract test (#1835) --- .../data_pipeline/tests/score/fixtures.py | 12 +++++++++ .../data_pipeline/tests/score/test_output.py | 27 ++++++++++++++----- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py index 2b1c7fc68..64d80bfad 100644 --- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py +++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py @@ -205,3 +205,15 @@ def hrs_df(): dtype={GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False, ) + + +@pytest.fixture() +def national_tract_df(): + national_tract_csv = constants.DATA_CENSUS_CSV_FILE_PATH + return pd.read_csv( + national_tract_csv, + names=[GEOID_TRACT_FIELD_NAME], + dtype={GEOID_TRACT_FIELD_NAME: "string"}, + low_memory=False, + header=None, + ) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index b79b5c711..6f73553eb 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -27,6 +27,7 @@ census_decennial_df, census_2010_df, hrs_df, + national_tract_df, ) @@ -255,7 +256,7 @@ def test_data_sources( } for data_source_name, data_source in data_sources.items(): - final = "final" + final = "final_" df: pd.DataFrame = final_score_df.merge( data_source, on=GEOID_TRACT_FIELD_NAME, @@ -264,21 +265,21 @@ def test_data_sources( how="left", ) - # Make sure we have NAs for any tracts in the final data that aren't - # covered in the final data - assert np.all(df[df.MERGE == "left_only"][final_columns].isna()) - df = df[df.MERGE == "both"] - # Make our lists of columns for later comparison core_cols = data_source.columns.intersection( final_score_df.columns 
).drop(GEOID_TRACT_FIELD_NAME) data_source_columns = [f"{col}_{data_source_name}" for col in core_cols] - final_columns = [f"{col}_{final}" for col in core_cols] + final_columns = [f"{col}{final}" for col in core_cols] assert ( final_columns ), f"No columns from data source show up in final score in source {data_source_name}" + # Make sure we have NAs for any tracts in the final data that aren't + # covered in the data source + assert np.all(df[df.MERGE == "left_only"][final_columns].isna()) + df = df[df.MERGE == "both"] + # Compare every column for equality, using close equality for numerics and # `equals` equality for non-numeric columns for final_column, data_source_column in zip( @@ -302,3 +303,15 @@ def test_data_sources( df[data_source_column], equal_nan=True, ), error_message + + +def test_output_tracts(final_score_df, national_tract_df): + df = final_score_df.merge( + national_tract_df, + on=GEOID_TRACT_FIELD_NAME, + how="outer", + indicator="MERGE", + ) + counts = df.value_counts("MERGE") + assert counts.loc["left_only"] == 0 + assert counts.loc["right_only"] == 0