From d42b7d6a162162014bc014158fffeb41e92f6a09 Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Tue, 8 Mar 2022 20:10:04 -0800 Subject: [PATCH] (torchx/scheduler) configure srun to pump stderr to *.err file instead of having everything redirected to stdout (#414) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/414 Addresses the log related QOL mentioned https://github.com/pytorch/torchx/issues/405. Split the stdout and stderr into two separate log files and make adjustments to the slurm_scheduler.log_iter implementation to support this. Reviewed By: d4l3k Differential Revision: D34729144 fbshipit-source-id: 7c05f8e30882dab35959278188a9f2deb698583e --- torchx/schedulers/slurm_scheduler.py | 26 +++++++++----- .../schedulers/test/slurm_scheduler_test.py | 36 ++++++++++++++++--- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/torchx/schedulers/slurm_scheduler.py b/torchx/schedulers/slurm_scheduler.py index 125da5f70..8c7695d52 100644 --- a/torchx/schedulers/slurm_scheduler.py +++ b/torchx/schedulers/slurm_scheduler.py @@ -11,14 +11,14 @@ """ import csv +import logging import os.path import shlex import subprocess import tempfile -import warnings from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict, List, Mapping, Optional, Tuple, Iterable +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple from torchx.schedulers.api import AppDryRunInfo, DescribeAppResponse, Scheduler, Stream from torchx.schedulers.local_scheduler import LogIterator @@ -65,6 +65,8 @@ "constraint", } +log: logging.Logger = logging.getLogger(__name__) + def _apply_app_id_env(s: str) -> str: """ @@ -115,6 +117,7 @@ def from_role( srun_opts = { "output": f"slurm-{macros.app_id}-{name}.out", + "error": f"slurm-{macros.app_id}-{name}.err", } return cls( @@ -415,18 +418,23 @@ def log_iter( should_tail: bool = False, streams: Optional[Stream] = None, ) -> Iterable[str]: + if streams is None: + log.info("log stream not specified, defaulting to STDERR") + elif streams == Stream.COMBINED: + raise ValueError( + "SlurmScheduler does not support COMBINED log stream." + " Use `stdout` or `stderr`" + ) + + extension = "out" if streams == Stream.STDOUT else "err" + if since or until: - warnings.warn( + log.warning( "since and/or until times specified for SlurmScheduler.log_iter." " These will be ignored and all log lines will be returned" ) - if streams is not None and streams != Stream.COMBINED: - warnings.warn( - "streams specified for SlurmScheduler.log_iter." - " These will be ignored and all log lines will be returned" - ) - log_file = f"slurm-{app_id}-{role_name}-{k}.out" + log_file = f"slurm-{app_id}-{role_name}-{k}.{extension}" return LogIterator( app_id, regex or ".*", log_file, self, should_tail=should_tail diff --git a/torchx/schedulers/test/slurm_scheduler_test.py b/torchx/schedulers/test/slurm_scheduler_test.py index c6b5e790a..09497434f 100644 --- a/torchx/schedulers/test/slurm_scheduler_test.py +++ b/torchx/schedulers/test/slurm_scheduler_test.py @@ -96,6 +96,7 @@ def test_replica_request(self) -> None: srun, [ '--output=slurm-"$SLURM_JOB_ID"-role-0.out', + '--error=slurm-"$SLURM_JOB_ID"-role-0.err', "--export=ALL,FOO=bar", "echo", "'hello slurm'", @@ -191,9 +192,9 @@ def test_dryrun_multi_role(self) -> None: export PYTHONUNBUFFERED=1 export SLURM_UNBUFFEREDIO=1 -srun --output=slurm-"$SLURM_JOB_ID"-a-0.out echo 0 'hello '"$SLURM_JOB_ID"'' :\\ - --output=slurm-"$SLURM_JOB_ID"-a-1.out echo 1 'hello '"$SLURM_JOB_ID"'' :\\ - --output=slurm-"$SLURM_JOB_ID"-b-0.out echo +srun --output=slurm-"$SLURM_JOB_ID"-a-0.out --error=slurm-"$SLURM_JOB_ID"-a-0.err echo 0 'hello '"$SLURM_JOB_ID"'' :\\ + --output=slurm-"$SLURM_JOB_ID"-a-1.out --error=slurm-"$SLURM_JOB_ID"-a-1.err echo 1 'hello '"$SLURM_JOB_ID"'' :\\ + --output=slurm-"$SLURM_JOB_ID"-b-0.out --error=slurm-"$SLURM_JOB_ID"-b-0.err echo """, ) @@ -356,12 +357,39 @@ def test_log_iter(self, run: MagicMock) -> None: "54", "echo", 1, - streams=Stream.STDERR, + streams=Stream.STDOUT, since=datetime.datetime.now(), ) ) self.assertEqual(logs, ["hello", "world"]) + with open("slurm-54-echo-1.err", "wt") as f: + f.write("foo\nbar\n") + + logs = list( + scheduler.log_iter( + "54", + "echo", + 1, + streams=Stream.STDERR, + ) + ) + + self.assertEqual(logs, ["foo", "bar"]) + + # no stream specified should default to STDERR + logs = list( + scheduler.log_iter( + "54", + "echo", + 1, + ) + ) + self.assertEqual(logs, ["foo", "bar"]) + + with self.assertRaises(ValueError): + scheduler.log_iter("54", "echo", 1, streams=Stream.COMBINED) + def test_dryrun_comment(self) -> None: scheduler = create_scheduler("foo") app = simple_app()