Skip to content

Commit

Permalink
Actually deduplicate (#1780)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Aug 17, 2022
1 parent b9113d7 commit 7fc1d78
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 1 deletion.
2 changes: 2 additions & 0 deletions data/data-pipeline/data_pipeline/etl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
# try running the high memory tasks separately
concurrent_datasets = dataset_list[:-2]
high_memory_datasets = dataset_list[-2:]
+ logger.info("Running concurrent jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(_run_one_dataset, dataset=dataset)
Expand All @@ -91,6 +92,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
# Calling result will raise an exception if one occurred.
# Otherwise, the exceptions are silently ignored.
fut.result()
+ logger.info("Running high-memory jobs")
for dataset in high_memory_datasets:
_run_one_dataset(dataset=dataset)

Expand Down
2 changes: 1 addition & 1 deletion data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def transform(self) -> None:
)
gdf = gdf.drop_duplicates(subset=["geometry"], keep="last")
gdf_tracts = add_tracts_for_geometries(gdf)
- gdf = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
+ gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]

0 comments on commit 7fc1d78

Please sign in to comment.