Skip to content

Commit

Permalink
save log in separate dir by default (#7825)
Browse files (browse the repository at this point in the history)
Signed-off-by: daquexian <daquexian566@gmail.com>

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and xiacijie committed Apr 24, 2022
1 parent a77403d commit 1337d81
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 24 deletions.
34 changes: 14 additions & 20 deletions python/oneflow/distributed/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ def parse_args():
"--redirect_stdout_and_stderr",
default=False,
action="store_true",
help=f"write the stdout and stderr to files\n '{stdout_filename}' and '{stderr_filename}'. Only available when logdir is set",
help=f"write the stdout and stderr to files\n '{stdout_filename}' and '{stderr_filename}' in logdir.",
)
parser.add_argument(
"--logdir",
default=None,
default="log",
type=str,
help=f"Relative path to write subprocess logs to. Passing in a relative\n path will create a directory if needed. Note that\n successive runs with the same path to write logs to will overwrite existing logs,\n so be sure to save logs as needed.",
)
Expand Down Expand Up @@ -122,12 +122,11 @@ def main():
)

processes: List[Any] = []
if args.logdir:
if os.path.exists(args.logdir):
if not os.path.isdir(args.logdir):
raise ValueError("argument --logdir must be a path to a directory.")
else:
os.mkdir(os.path.join(os.getcwd(), args.logdir))
if os.path.exists(args.logdir):
if not os.path.isdir(args.logdir):
raise ValueError("argument --logdir must be a path to a directory.")
else:
os.mkdir(os.path.join(os.getcwd(), args.logdir))
subprocess_file_handles = []
for local_rank in range(0, args.nproc_per_node):
dist_rank = args.nproc_per_node * args.node_rank + local_rank
Expand All @@ -147,20 +146,15 @@ def main():
cmd.extend(args.training_script_args)
stdout_handle: Optional[IO]
stderr_handle: Optional[IO]
if args.logdir:
directory_path = os.path.join(
os.getcwd(), args.logdir, f"local_rank_{local_rank}"
)
os.makedirs(directory_path, exist_ok=True)
current_env["GLOG_log_dir"] = directory_path
log_directory_path = os.path.join(
os.getcwd(), args.logdir, f"local_rank_{local_rank}"
)
os.makedirs(log_directory_path, exist_ok=True)
current_env["GLOG_log_dir"] = log_directory_path
if args.redirect_stdout_and_stderr:
if not args.logdir:
raise ValueError(
"'redirect_stdout_and_stderr' is only available when 'logdir' is set."
)
node_rank = args.node_rank
stdout_handle = open(os.path.join(directory_path, stdout_filename), "w")
stderr_handle = open(os.path.join(directory_path, stderr_filename), "w")
stdout_handle = open(os.path.join(log_directory_path, stdout_filename), "w")
stderr_handle = open(os.path.join(log_directory_path, stderr_filename), "w")
subprocess_file_handles.append((stdout_handle, stderr_handle))
stdout_name = stdout_handle.name
stderr_name = stderr_handle.name
Expand Down
8 changes: 4 additions & 4 deletions python/oneflow/test/modules/test_eager_boxing_exhaustive.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def _test_eager_boxing_normal_1d_exhaustive_testing(
placement=in_placement, sbp=elem[0]
)
y = x.to_global(placement=out_placement, sbp=elem[1])
test_case.assertTrue(np.allclose(y.numpy(), x.numpy()))
test_case.assertTrue(np.allclose(y.numpy(), x.numpy(), 1e-5, 1e-5))


def _test_eager_boxing_symmetric_2d_exhaustive_testing(
Expand All @@ -66,7 +66,7 @@ def _test_eager_boxing_symmetric_2d_exhaustive_testing(
placement=in_placement, sbp=elem[0]
)
y = x.to_global(placement=out_placement, sbp=elem[1])
test_case.assertTrue(np.allclose(y.numpy(), x.numpy()))
test_case.assertTrue(np.allclose(y.numpy(), x.numpy(), 1e-5, 1e-5))


def _test_eager_boxing_1d_special_split_axis(
Expand All @@ -86,7 +86,7 @@ def _test_eager_boxing_1d_special_split_axis(
placement=in_placement, sbp=elem[0]
)
y = x.to_global(placement=out_placement, sbp=elem[1])
test_case.assertTrue(np.allclose(y.numpy(), x.numpy()))
test_case.assertTrue(np.allclose(y.numpy(), x.numpy(), 1e-5, 1e-5))


def _test_eager_boxing_2d_special_split_axis(test_case, in_device, out_device):
Expand All @@ -107,7 +107,7 @@ def _test_eager_boxing_2d_special_split_axis(test_case, in_device, out_device):
placement=in_placement, sbp=elem[0]
)
y = x.to_global(placement=out_placement, sbp=elem[1])
test_case.assertTrue(np.allclose(y.numpy(), x.numpy()))
test_case.assertTrue(np.allclose(y.numpy(), x.numpy(), 1e-5, 1e-5))


@flow.unittest.skip_unless_1n4d()
Expand Down

0 comments on commit 1337d81

Please sign in to comment.