Skip to content

Commit

Permalink
Update deephyper runs (ORNL#274)
Browse files Browse the repository at this point in the history
* Update deephyper runs

Move the `subprocess.check_output` call inside the try-except block so that all errors, including a failing command, are captured.

* Update gfm_deephyper_multi_perlmutter.py

* Update distributed.py

Use the "SLURM_STEP_NODELIST" environment variable when it is set; this is needed for HPO, where runs are launched as job steps.
  • Loading branch information
jychoi-hpc authored and RylieWeaver committed Sep 25, 2024
1 parent c0a71b8 commit 4283d15
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 2 deletions.
2 changes: 1 addition & 1 deletion examples/multidataset_hpo/gfm_deephyper_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ def run(trial, dequed=None):
)
print("Command = ", command, flush=True, file=f)

result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
output = "F"
try:
result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"
fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r")
while True:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def run(trial, dequed=None):
)
print("Command = ", command, flush=True, file=f)

result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
output = "F"
try:
result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"
fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r")
while True:
Expand Down
3 changes: 3 additions & 0 deletions hydragnn/utils/distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ def setup_ddp(use_deepspeed=False):
master_addr = os.environ["LSB_HOSTS"].split()[1]
elif os.getenv("LSB_MCPU_HOSTS") is not None:
master_addr = os.environ["LSB_MCPU_HOSTS"].split()[2]
elif os.getenv("SLURM_STEP_NODELIST") is not None:
## The following is CADES/Frontier/Perlmutter specific with job steps
master_addr = parse_slurm_nodelist(os.environ["SLURM_STEP_NODELIST"])[0]
elif os.getenv("SLURM_NODELIST") is not None:
## The following is CADES specific
master_addr = parse_slurm_nodelist(os.environ["SLURM_NODELIST"])[0]
Expand Down

0 comments on commit 4283d15

Please sign in to comment.