Skip to content

Commit

Permalink
Document new flag for datasets (#1780)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattbowen-usds committed Aug 17, 2022
1 parent ee6cb44 commit 24802a9
Showing 1 changed file with 16 additions and 3 deletions.
19 changes: 16 additions & 3 deletions data/data-pipeline/data_pipeline/etl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,22 @@ def etl_runner(dataset_to_run: str = None) -> None:
None
"""
dataset_list = _get_datasets_to_run(dataset_to_run)
# try running the high memory tasks separately
concurrent_datasets = [dataset for dataset in dataset_list if not dataset['is_memory_intensive']]
high_memory_datasets = [dataset for dataset in dataset_list if dataset['is_memory_intensive']]

# Because we are memory constrained on our infrastructure,
# we split datasets into those that are not memory intensive
# (is_memory_intensive == False) and therefore can be safely
# run in parallel, and those that require more RAM and thus
# should be run sequentially. The is_memory_intensive flag is
# set manually in constants.py based on experience running
# the pipeline.
concurrent_datasets = [
dataset
for dataset in dataset_list
if not dataset["is_memory_intensive"]
]
high_memory_datasets = [
dataset for dataset in dataset_list if dataset["is_memory_intensive"]
]

logger.info("Running concurrent jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
Expand Down

0 comments on commit 24802a9

Please sign in to comment.