From b7c80e3f521b0d9dd593cb4267073a40c2e0825c Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Wed, 17 Aug 2022 10:21:38 -0400
Subject: [PATCH] Document new flag for datasets (#1780)

---
 .../data-pipeline/data_pipeline/etl/runner.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py
index 11f64a9ce..6d98b1eca 100644
--- a/data/data-pipeline/data_pipeline/etl/runner.py
+++ b/data/data-pipeline/data_pipeline/etl/runner.py
@@ -76,9 +76,22 @@ def etl_runner(dataset_to_run: str = None) -> None:
         None
     """
     dataset_list = _get_datasets_to_run(dataset_to_run)
-    # try running the high memory tasks separately
-    concurrent_datasets = [dataset for dataset in dataset_list if not dataset['is_memory_intensive']]
-    high_memory_datasets = [dataset for dataset in dataset_list if dataset['is_memory_intensive']]
+
+    # Because we are memory-constrained on our infrastructure,
+    # we split datasets into those that are not memory intensive
+    # (is_memory_intensive == False) and thereby can be safely
+    # run in parallel, and those that require more RAM and thus
+    # should be run sequentially. The is_memory_intensive flag is
+    # set manually in constants.py based on experience running
+    # the pipeline.
+    concurrent_datasets = [
+        dataset
+        for dataset in dataset_list
+        if not dataset["is_memory_intensive"]
+    ]
+    high_memory_datasets = [
+        dataset for dataset in dataset_list if dataset["is_memory_intensive"]
+    ]
 
     logger.info("Running concurrent jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor:
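
For reviewers following along: the trailing context of the hunk shows that concurrent_datasets feeds a concurrent.futures.ThreadPoolExecutor, while the new comment says high-memory datasets should run sequentially. Below is a minimal sketch of how the two lists could be consumed downstream; the sequential loop and the _run_one_dataset helper (including its module_dir/class_name lookup) are assumptions for illustration and are not part of this diff.

import concurrent.futures
import importlib
from typing import List


def _run_one_dataset(dataset: dict) -> None:
    # Hypothetical helper: instantiate the ETL class named in the
    # dataset entry and run its extract/transform/load steps.
    etl_module = importlib.import_module(
        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
    )
    etl_class = getattr(etl_module, dataset["class_name"])
    etl = etl_class()
    etl.extract()
    etl.transform()
    etl.load()


def run_split(
    concurrent_datasets: List[dict], high_memory_datasets: List[dict]
) -> None:
    # Low-memory datasets can share the machine, so they run in a
    # thread pool; high-memory datasets run one at a time afterwards.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(_run_one_dataset, dataset=dataset)
            for dataset in concurrent_datasets
        }
        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raise any exception from a worker thread
    for dataset in high_memory_datasets:
        _run_one_dataset(dataset=dataset)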