Actually deduplicate (#1780)

usds · Aug 17, 2022 · 7fc1d78
1 parent b9113d7
commit 7fc1d78
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 1 deletion.
diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py
@@ -81,6 +81,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
     # try running the high memory tasks separately
     concurrent_datasets = dataset_list[:-2]
     high_memory_datasets = dataset_list[-2:]
+    logger.info("Running concurrent jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = {
             executor.submit(_run_one_dataset, dataset=dataset)
@@ -91,6 +92,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
             # Calling result will raise an exception if one occurred.
             # Otherwise, the exceptions are silently ignored.
             fut.result()
+    logger.info("Running high-memory jobs")
     for dataset in high_memory_datasets:
         _run_one_dataset(dataset=dataset)
 

diff --git a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py
@@ -57,7 +57,7 @@ def transform(self) -> None:
         )
         gdf = gdf.drop_duplicates(subset=["geometry"], keep="last")
         gdf_tracts = add_tracts_for_geometries(gdf)
-        gdf = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
+        gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
         gdf_tracts[self.AML_BOOLEAN] = True
         self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]