From ff0fa448a4efde1424926212f3b6090f9c27385a Mon Sep 17 00:00:00 2001 From: Jong Choi Date: Fri, 2 Aug 2024 11:32:43 -0400 Subject: [PATCH 1/3] Update deephyper runs Update to capture all errors with try-except block. --- examples/multidataset_hpo/gfm_deephyper_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multidataset_hpo/gfm_deephyper_multi.py b/examples/multidataset_hpo/gfm_deephyper_multi.py index f04f7246d..5ba8ee419 100644 --- a/examples/multidataset_hpo/gfm_deephyper_multi.py +++ b/examples/multidataset_hpo/gfm_deephyper_multi.py @@ -88,9 +88,9 @@ def run(trial, dequed=None): ) print("Command = ", command, flush=True, file=f) - result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) output = "F" try: + result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)" fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r") while True: From d6ba2682549abc931f35e5eeaa15c7305495abf3 Mon Sep 17 00:00:00 2001 From: Jong Choi Date: Fri, 2 Aug 2024 11:34:19 -0400 Subject: [PATCH 2/3] Update gfm_deephyper_multi_perlmutter.py --- examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py b/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py index cb10e1bff..094b9be28 100644 --- a/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py +++ b/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py @@ -89,9 +89,9 @@ def run(trial, dequed=None): ) print("Command = ", command, flush=True, file=f) - result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) output = "F" try: + result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)" fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r") while True: From 1fb809a75b078ecc4b9765b69694484e7baf49f8 Mon Sep 17 00:00:00 2001 From: Jong Choi Date: Fri, 2 Aug 2024 11:37:27 -0400 Subject: [PATCH 3/3] Update distributed.py Use "SLURM_STEP_NODELIST" env, which is needed for HPO. --- hydragnn/utils/distributed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hydragnn/utils/distributed.py b/hydragnn/utils/distributed.py index b533beb92..50c853776 100644 --- a/hydragnn/utils/distributed.py +++ b/hydragnn/utils/distributed.py @@ -147,6 +147,9 @@ def setup_ddp(use_deepspeed=False): master_addr = os.environ["LSB_HOSTS"].split()[1] elif os.getenv("LSB_MCPU_HOSTS") is not None: master_addr = os.environ["LSB_MCPU_HOSTS"].split()[2] + elif os.getenv("SLURM_STEP_NODELIST") is not None: + ## The following is CADES/Frontier/Perlmutter specific with job steps + master_addr = parse_slurm_nodelist(os.environ["SLURM_STEP_NODELIST"])[0] elif os.getenv("SLURM_NODELIST") is not None: ## The following is CADES specific master_addr = parse_slurm_nodelist(os.environ["SLURM_NODELIST"])[0]