From 571e9a65d5da82c8616e7e8e7bc33e8f692bad52 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Thu, 13 Oct 2022 14:07:45 -0400 Subject: [PATCH 1/2] Add assertion around codebook (#1505) --- .../data_pipeline/etl/score/etl_score_post.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index a896f4702..ae95d7939 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -532,6 +532,29 @@ def _load_downloadable_zip(self, downloadable_info_path: Path) -> None: "fields" ], ) + # Check the codebook to make sure it matches the download files + assert not set(codebook_df["csv_label"].dropna()).difference( + downloadable_df.columns + ), "Codebook is missing columns from downloadable CSV" + assert not set(codebook_df["excel_label"].dropna()).difference( + downloadable_df.columns + ), "Codebook is missing columns from downloadable excel" + assert ( + len( + downloadable_df.columns.difference( + set(codebook_df["csv_label"]) + ) + ) + == 0 + ), "Codebook has columns the CSV does not" + assert ( + len( + downloadable_df.columns.difference( + set(codebook_df["excel_label"]) + ) + ) + == 0 + ), "Codebook has columns the CSV does not" # load codebook to disk codebook_df.to_csv(codebook_path, index=False) From c549aa9e4fdd67eb0ab9adad814fc4fa9feec12c Mon Sep 17 00:00:00 2001 From: matt bowen Date: Thu, 13 Oct 2022 14:54:56 -0400 Subject: [PATCH 2/2] Assert csv and excel have same cols (#1505) --- .../data_pipeline/etl/score/etl_score_post.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index ae95d7939..208af5c3c 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -532,13 +532,14 @@ def _load_downloadable_zip(self, downloadable_info_path: Path) -> None: "fields" ], ) + assert codebook_df["csv_label"].equals(codebook_df["excel_label"]), ( + "CSV and Excel differ. If that's intentional, " + "remove this assertion. Otherwise, fix it." + ) # Check the codebook to make sure it matches the download files assert not set(codebook_df["csv_label"].dropna()).difference( downloadable_df.columns - ), "Codebook is missing columns from downloadable CSV" - assert not set(codebook_df["excel_label"].dropna()).difference( - downloadable_df.columns - ), "Codebook is missing columns from downloadable excel" + ), "Codebook is missing columns from downloadable files" assert ( len( downloadable_df.columns.difference( @@ -546,15 +547,7 @@ def _load_downloadable_zip(self, downloadable_info_path: Path) -> None: ) ) == 0 - ), "Codebook has columns the CSV does not" - assert ( - len( - downloadable_df.columns.difference( - set(codebook_df["excel_label"]) - ) - ) - == 0 - ), "Codebook has columns the CSV does not" + ), "Codebook has columns the downloadable files do not" # load codebook to disk codebook_df.to_csv(codebook_path, index=False)