Skip to content

Commit

Permalink
Apply Emma's feedback from review (#1848)
Browse files (browse the repository at this point in the history)
  • Loading branch information
mattbowen-usds committed Aug 31, 2022
1 parent ef6b9fc commit 11ee2bc
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 4 deletions.
4 changes: 2 additions & 2 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
census_tract_df = self._join_tract_dfs(census_tract_dfs)

# Drop tracts that don't exist in the 2010 tracts
pre_join_len = census_tract_df.shape[0]
pre_join_len = census_tract_df[field_names.GEOID_TRACT_FIELD].nunique()

census_tract_df = census_tract_df.merge(
self.national_tract_df,
Expand All @@ -407,7 +407,7 @@ def _prepare_initial_df(self) -> pd.DataFrame:
), "Join against national tract list ADDED rows"
logger.info(
"Dropped %s tracts not in the 2010 tract data",
pre_join_len - census_tract_df.shape[0],
pre_join_len - census_tract_df[field_names.GEOID_TRACT_FIELD].nunique()
)

# Now sanity-check the merged df.
Expand Down
3 changes: 3 additions & 0 deletions data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,9 @@ def _create_score_data(
right_on=self.STATE_CODE_COLUMN,
how="left",
)
assert score_county_merged[
self.GEOID_TRACT_FIELD_NAME
].is_unique, "Merging state/county data introduced duplicate rows"
# set the score to the new df
return score_county_state_merged

Expand Down
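4 changes: 2 additions & 2 deletions (third changed file; its path was not captured in this page text)
Original file line number Diff line number Diff line change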
Expand Up @@ -174,7 +174,7 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):

# Are the dataframes the same shape truly
assert tiles_df.shape == final_score_df.shape
assert (tiles_df.GTF == final_score_df.GTF).all()
assert tiles_df["GTF"].equals(final_score_df["GTF"])
assert sorted(tiles_df.columns) == sorted(final_score_df.columns)

# Are all the dtypes and values the same?
Expand All @@ -190,6 +190,6 @@ def test_for_column_fidelitiy_from_score(tiles_df, final_score_df):


def test_for_state_names(tiles_df):
states = tiles_df.SF.value_counts(dropna=False).index
states = tiles_df["SF"].value_counts(dropna=False).index
assert np.nan not in states
assert states.all()

0 comments on commit 11ee2bc

Please sign in to comment.