diff --git a/examples/multidataset_hpo/gfm_deephyper_multi.py b/examples/multidataset_hpo/gfm_deephyper_multi.py index f04f7246d..5ba8ee419 100644 --- a/examples/multidataset_hpo/gfm_deephyper_multi.py +++ b/examples/multidataset_hpo/gfm_deephyper_multi.py @@ -88,9 +88,9 @@ def run(trial, dequed=None): ) print("Command = ", command, flush=True, file=f) - result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) output = "F" try: + result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)" fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r") while True: diff --git a/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py b/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py index cb10e1bff..094b9be28 100644 --- a/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py +++ b/examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py @@ -89,9 +89,9 @@ def run(trial, dequed=None): ) print("Command = ", command, flush=True, file=f) - result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) output = "F" try: + result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT) pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)" fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r") while True: diff --git a/hydragnn/utils/distributed.py b/hydragnn/utils/distributed.py index b533beb92..50c853776 100644 --- a/hydragnn/utils/distributed.py +++ b/hydragnn/utils/distributed.py @@ -147,6 +147,9 @@ def setup_ddp(use_deepspeed=False): master_addr = os.environ["LSB_HOSTS"].split()[1] elif os.getenv("LSB_MCPU_HOSTS") is not None: master_addr = os.environ["LSB_MCPU_HOSTS"].split()[2] + elif os.getenv("SLURM_STEP_NODELIST") is not None: + ## The following is CADES/Frontier/Perlmutter specific with job steps + master_addr = parse_slurm_nodelist(os.environ["SLURM_STEP_NODELIST"])[0] elif os.getenv("SLURM_NODELIST") is not None: ## The following is CADES specific master_addr = parse_slurm_nodelist(os.environ["SLURM_NODELIST"])[0]