Document new flag for datasets (#1780)

usds · Aug 17, 2022 · 24802a9 · 24802a9
1 parent ee6cb44
commit 24802a9
Showing 1 changed file with 16 additions and 3 deletions.
diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py
@@ -76,9 +76,22 @@ def etl_runner(dataset_to_run: str = None) -> None:
         None
     """
     dataset_list = _get_datasets_to_run(dataset_to_run)
-    # try running the high memory tasks separately
-    concurrent_datasets = [dataset for dataset in dataset_list if not dataset['is_memory_intensive']]
-    high_memory_datasets = [dataset for dataset in dataset_list if dataset['is_memory_intensive']]
+
+    # Because we are memory-constrained on our infrastructure,
+    # we split datasets into those that are not memory intensive
+    # (is_memory_intensive == False) and can therefore be safely
+    # run in parallel, and those that require more RAM and thus
+    # should be run sequentially. The is_memory_intensive flag is
+    # set manually in constants.py based on experience running
+    # the pipeline.
+    concurrent_datasets = [
+        dataset
+        for dataset in dataset_list
+        if not dataset["is_memory_intensive"]
+    ]
+    high_memory_datasets = [
+        dataset for dataset in dataset_list if dataset["is_memory_intensive"]
+    ]
 
     logger.info("Running concurrent jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor: