Apply Emma's feedback from review (#1848)

usds · Aug 31, 2022 · 11ee2bc
1 parent ef6b9fc
commit 11ee2bc
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 4 deletions.
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -395,7 +395,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
         census_tract_df = self._join_tract_dfs(census_tract_dfs)
 
         # Drop tracts that don't exist in the 2010 tracts
-        pre_join_len = census_tract_df.shape[0]
+        pre_join_len = census_tract_df[field_names.GEOID_TRACT_FIELD].nunique()
 
         census_tract_df = census_tract_df.merge(
             self.national_tract_df,
@@ -407,7 +407,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
         ), "Join against national tract list ADDED rows"
         logger.info(
             "Dropped %s tracts not in the 2010 tract data",
-            pre_join_len - census_tract_df.shape[0],
+            pre_join_len - census_tract_df[field_names.GEOID_TRACT_FIELD].nunique()
         )
 
         # Now sanity-check the merged df.

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -219,6 +219,9 @@ def _create_score_data(
             right_on=self.STATE_CODE_COLUMN,
             how="left",
         )
+        assert score_county_merged[
+            self.GEOID_TRACT_FIELD_NAME
+        ].is_unique, "Merging state/county data introduced duplicate rows"
         # set the score to the new df
         return score_county_state_merged
 

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py b/data/data-pipeline/data_pipeline/tests/score/test_tiles_smoketests.py
@@ -174,7 +174,7 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
 
     # Are the dataframes the same shape truly
     assert tiles_df.shape == final_score_df.shape
-    assert (tiles_df.GTF == final_score_df.GTF).all()
+    assert tiles_df["GTF"].equals(final_score_df["GTF"])
     assert sorted(tiles_df.columns) == sorted(final_score_df.columns)
 
     # Are all the dtypes and values the same?
@@ -190,6 +190,6 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):
 
 
 def test_for_state_names(tiles_df):
-    states = tiles_df.SF.value_counts(dropna=False).index
+    states = tiles_df["SF"].value_counts(dropna=False).index
     assert np.nan not in states
     assert states.all()