Merge branch 'main' into update_test_script

YuanTingHsieh · Oct 7, 2024 · 72f2442 · 72f2442
2 parents b8cfd82 + 3ec948e
commit 72f2442
Show file tree

Hide file tree

Showing 64 changed files with 98 additions and 99 deletions.
diff --git a/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb b/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb
@@ -75,7 +75,7 @@
    "source": [
     "### Download Model Checkpoints\n",
     "\n",
-    "In order to download pretrained models from the NGC registry, **please ensure that you have installed and configured the NGC CLI**, check the [Quickstart Guide](https://docs.nvidia.com/bionemo-framework/latest/quickstart-fw.html) for more info. The following code will download the pretrained model `esm2nv_650M_converted.nemo` from the NGC registry."
+    "In order to download pretrained models from the NGC registry, **please ensure that you have installed and configured the NGC CLI**, check the [Quickstart Guide](https://docs.nvidia.com/bionemo-framework/latest) for more info. The following code will download the pretrained model `esm2nv_650M_converted.nemo` from the NGC registry."
    ]
   },
   {

diff --git a/...advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml b/...advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...onemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/...onemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml
@@ -13,7 +13,7 @@ encoder_frozen: False
 trainer:
   devices: 1 # number of GPUs or CPUs
   num_nodes: 1 
-  max_epochs: 200
+  max_epochs: 20
   val_check_interval: 0.0
   limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable
   limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable
@@ -54,7 +54,7 @@ model:
     target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file
     target_sizes: [2] # number of classes in each label for classifications or 1 for regression
     num_classes: 2
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...anced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml b/...anced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/...ced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf b/...ced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf
@@ -35,7 +35,7 @@
             min_clients = 6
 
             # number of global round of the training.
-            num_rounds = 50
+            num_rounds = 20
 
             # starting round is 0-based
             start_round = 0

diff --git a/.../advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml b/.../advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...ionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/...ionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml
@@ -54,7 +54,7 @@ model:
     target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file
     target_sizes: [2] # number of classes in each label for classifications or 1 for regression
     num_classes: 2
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...vanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml b/...vanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/...s/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml b/...s/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/...bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml
@@ -13,7 +13,7 @@ encoder_frozen: False
 trainer:
   devices: 1 # number of GPUs or CPUs
   num_nodes: 1 
-  max_epochs: 200
+  max_epochs: 20
   val_check_interval: 0.0
   limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable
   limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable
@@ -54,7 +54,7 @@ model:
     target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file
     target_sizes: [2] # number of classes in each label for classifications or 1 for regression
     num_classes: 2
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...dvanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml b/...dvanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py b/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py
@@ -14,15 +14,14 @@
 
 from nvflare import SimulatorRunner
 
-n_clients = 6
-
 # Choose from one of the available jobs
 job_name = "central_sabdab_esm1nv"
-# job_name = "local_sabdab_esm1nv"
-# job_name = "fedavg_sabdab_esm1nv"
+n_clients = 1
+# job_name = "local_sabdab_esm1nv"; n_clients = 6
+# job_name = "fedavg_sabdab_esm1nv"; n_clients = 6
 
 simulator = SimulatorRunner(
-    job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=1
+    job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=n_clients
 )
 run_status = simulator.run()
 print("Simulator finished with run_status", run_status)
diff --git a/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml b/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/downstream_flip_scl.yaml b/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app2/custom/base_config.yaml b/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app2/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app2/custom/downstream_flip_scl.yaml b/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app2/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app3/custom/base_config.yaml b/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app3/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app3/custom/downstream_flip_scl.yaml b/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app3/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...vanced/bionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app1/custom/base_config.yaml b/...vanced/bionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app1/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...ionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app1/custom/downstream_flip_scl.yaml b/...ionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app1/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...vanced/bionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app2/custom/base_config.yaml b/...vanced/bionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app2/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...ionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app2/custom/downstream_flip_scl.yaml b/...ionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app2/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...vanced/bionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app3/custom/base_config.yaml b/...vanced/bionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app3/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...ionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app3/custom/downstream_flip_scl.yaml b/...ionemo/downstream/scl/jobs/local_scl_finetune_esm2nv/app3/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/examples/advanced/bionemo/downstream/scl/run_sim_scl.py b/examples/advanced/bionemo/downstream/scl/run_sim_scl.py
@@ -21,7 +21,7 @@
 # job_name = "fedavg_scl_finetune_esm2nv"
 
 simulator = SimulatorRunner(
-    job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=1
+    job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=n_clients
 )
 run_status = simulator.run()
 print("Simulator finished with run_status", run_status)
diff --git a/...ples/advanced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app1/custom/base_config.yaml b/...ples/advanced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app1/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...anced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app1/custom/downstream_flip_tap.yaml b/...anced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app1/custom/downstream_flip_tap.yaml
@@ -13,7 +13,7 @@ encoder_frozen: False
 trainer:
   devices: 1 # number of GPUs or CPUs
   num_nodes: 1 
-  max_epochs: 200
+  max_epochs: 20
   val_check_interval: 1
   limit_val_batches: 1000 # number of batches in validation step, use fraction for fraction of data, 0 to disable
   limit_test_batches: 1000 # number of batches in test step, use fraction for fraction of data, 0 to disable
@@ -53,7 +53,7 @@ model:
     sequence_column: "Antibody" # name of column with protein sequence in csv file
     target_column: ["PSH"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [1] # number of classes in each label for classifications or 1 for regression
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...s/advanced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app1/custom/pretrain_small.yaml b/...s/advanced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app1/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/...ples/advanced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app2/custom/base_config.yaml b/...ples/advanced/bionemo/downstream/tap/jobs/central_tap_esm1nv/app2/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset: