diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 5cc038fcc..53117907a 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -395,19 +395,20 @@ def _prepare_initial_df(self) -> pd.DataFrame: census_tract_df = self._join_tract_dfs(census_tract_dfs) # Drop tracts that don't exist in the 2010 tracts - pre_join_len = census_tract_df.shape[0] + pre_join_len = census_tract_df.shape[0] census_tract_df = census_tract_df.merge( self.national_tract_df, on="GEOID10_TRACT", how="inner", ) - + assert ( + census_tract_df.shape[0] <= pre_join_len + ), "Join against national tract list ADDED rows" logger.info( "Dropped %s tracts not in the 2010 tract data", pre_join_len - census_tract_df.shape[0], - ) # If GEOID10s are read as numbers instead of strings, the initial 0 is dropped, - # and then we get too many CBG rows (one for 012345 and one for 12345). + ) # Now sanity-check the merged df. self._census_tract_df_sanity_check( diff --git a/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py b/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py index bf787ba7e..f2a1da7b9 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py @@ -136,6 +136,7 @@ def error_message(self) -> Optional[str]: f"score_df: {self.final_score_dtype}, " f"tile_df: {self.tile_dtype}" ) + return None def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):