Skip to content

Commit d1e1403

Browse files
authored
manager: implement start (#1809)
Summary: This function starts a new TensorBoard process with the given arguments, or reuses an existing compatible process. It returns a `TensorboardInfo` object describing how to reach the resulting TensorBoard process (whether new or reused). See docs for more details. Test Plan: End-to-end tests included. These appear to be lightly flaky: I ran bazel test //tensorboard:manager_e2e_test --runs_per_test=100 six times on each of Python 2 and 3, and experienced three total failures on Python 2 and zero on Python 3. On my machine, the test takes 14.7±0.9s to run on Python 2, and 17.9±1.0s to run on Python 3. To test manually, run `bazel build //tensorboard`, then add that binary to your path and head over to a Python REPL: $ export PATH="$(readlink -e ./bazel-bin/tensorboard):$PATH" $ python >>> from tensorboard import manager >>> r1 = manager.start(["--logdir", "~/tensorboard_data", "--port", "0"]) >>> type(r1) <class 'tensorboard.manager.StartLaunched'> >>> r2 = manager.start(["--logdir", "~/tensorboard_data", "--port", "0"]) >>> type(r2) <class 'tensorboard.manager.StartReused'> >>> r1.info == r2.info True >>> r1.info.port 39081 >>> import os >>> os.system("curl --silent localhost:39081 | tail -c 64") <tf-tensorboard use-hash brand="TensorBoard"></tf-tensorboard> 0 >>> manager.get_all() == [r1.info] True >>> os.kill(r1.info.pid, 15) >>> manager.get_all() == [] True wchargin-branch: manager-start
1 parent 898db01 commit d1e1403

File tree

3 files changed

+477
-0
lines changed

3 files changed

+477
-0
lines changed

tensorboard/BUILD

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,27 @@ py_test(
9696
],
9797
)
9898

99+
# End-to-end test for the `:manager` library.
py_test(
    name = "manager_e2e_test",
    size = "large",  # spawns subprocesses, sleeps, makes requests to localhost
    # An explicit `timeout` takes precedence over the default timeout that
    # `size` would otherwise imply.
    timeout = "short",  # about 15 seconds on my machine
    # On Python 2, this test fails about 0.5% of the time when run with
    # high parallelism; TensorBoard subprocesses time out instead of
    # launching successfully.
    flaky = True,
    srcs = ["manager_e2e_test.py"],
    srcs_version = "PY2AND3",
    visibility = ["//tensorboard:internal"],
    deps = [
        ":manager",
        "//tensorboard:expect_tensorflow_installed",
        "@org_pythonhosted_six",
    ],
    # Runtime-only dependency on the `:tensorboard` target; presumably this
    # is the binary that the test launches as a subprocess -- confirm
    # against manager_e2e_test.py.
    data = [
        ":tensorboard",
    ],
)
119+
99120
py_library(
100121
name = "program",
101122
srcs = ["program.py"],

tensorboard/manager.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
import errno
2525
import json
2626
import os
27+
import subprocess
2728
import tempfile
29+
import time
2830

2931
import six
3032

@@ -304,3 +306,135 @@ def get_all():
304306
else:
305307
results.append(info)
306308
return results
309+
310+
311+
# Result types for `start`. Every call to `start` returns an instance of
# exactly one of the four namedtuples defined below.

# Returned when the request was compatible with a TensorBoard process
# that already exists; `info` describes the process to reuse.
StartReused = collections.namedtuple("StartReused", ["info"])

# Returned when a new TensorBoard process was launched successfully;
# `info` describes how to reach it.
StartLaunched = collections.namedtuple("StartLaunched", ["info"])

# Returned when a new TensorBoard subprocess was spawned but exited.
# Carries the exit code and the captured output streams. A stream value
# is `None` when its contents are no longer available (e.g., because the
# user has emptied /tmp/).
StartFailed = collections.namedtuple(
    "StartFailed",
    [
        # int, as `Popen.returncode` (negative for signal)
        "exit_code",
        # str, or `None` if the stream could not be read
        "stdout",
        # str, or `None` if the stream could not be read
        "stderr",
    ],
)

# Returned when a TensorBoard subprocess was spawned but neither exited
# nor wrote its info file before the timeout elapsed. The process may
# still be running under `pid`.
StartTimedOut = collections.namedtuple("StartTimedOut", ["pid"])
342+
343+
344+
def start(arguments, timeout=datetime.timedelta(seconds=10)):
    """Start a new TensorBoard instance, or reuse a compatible one.

    If the cache key determined by the provided arguments and the current
    working directory (see `cache_key`) matches the cache key of a running
    TensorBoard process (see `get_all`), that process will be reused.

    Otherwise, a new TensorBoard process will be spawned with the provided
    arguments, using the `tensorboard` binary from the system path. The
    subprocess's stdout and stderr are redirected to newly created
    temporary files; this function does not delete those files.

    Args:
      arguments: List of strings to be passed as arguments to
        `tensorboard`. (If you have a raw command-line string, see
        `shlex.split`.)
      timeout: `datetime.timedelta` object describing how long to wait for
        the subprocess to initialize a TensorBoard server and write its
        `TensorboardInfo` file. If the info file is not written within
        this time period, `start` will assume that the subprocess is stuck
        in a bad state, and will give up on waiting for it and return a
        `StartTimedOut` result. Note that in such a case the subprocess
        will not be killed. The subprocess is always checked at least
        once, even for a zero or negative timeout. Default value is 10
        seconds.

    Returns:
      A `StartReused`, `StartLaunched`, `StartFailed`, or `StartTimedOut`
      object.

    Raises:
      Any exception raised by `subprocess.Popen` (for instance, if the
      `tensorboard` binary cannot be executed) propagates to the caller.
    """
    match = _find_matching_instance(
        cache_key(
            working_directory=os.getcwd(),
            arguments=arguments,
            configure_kwargs={},
        ),
    )
    if match:
        return StartReused(info=match)

    (stdout_fd, stdout_path) = tempfile.mkstemp(prefix=".tensorboard-stdout-")
    (stderr_fd, stderr_path) = tempfile.mkstemp(prefix=".tensorboard-stderr-")
    start_time = datetime.datetime.now()
    try:
        p = subprocess.Popen(
            ["tensorboard"] + arguments,
            stdout=stdout_fd,
            stderr=stderr_fd,
        )
    finally:
        # The child holds its own copies of these descriptors; the parent
        # only needs the paths from here on.
        os.close(stdout_fd)
        os.close(stderr_fd)

    poll_interval = datetime.timedelta(seconds=0.5)
    deadline = start_time + timeout
    while True:
        # Check before sleeping so that the subprocess is inspected at
        # least once regardless of how short the timeout is.
        subprocess_result = p.poll()
        if subprocess_result is not None:
            return StartFailed(
                exit_code=subprocess_result,
                stdout=_maybe_read_file(stdout_path),
                stderr=_maybe_read_file(stderr_path),
            )
        for info in get_all():
            # Match on PID and start time so that a stale info file left
            # behind by an older process is not mistaken for the new one.
            # NOTE(review): if `TensorboardInfo.start_time` is stored at a
            # coarser resolution than `datetime.now()` (e.g., whole
            # seconds), this comparison could spuriously reject the new
            # process's info file -- confirm against the info file
            # serialization.
            if info.pid == p.pid and info.start_time >= start_time:
                return StartLaunched(info=info)

        remaining = deadline - datetime.datetime.now()
        if remaining <= datetime.timedelta(0):
            return StartTimedOut(pid=p.pid)
        # Never sleep past the deadline: the final check then happens at
        # (approximately) the deadline rather than up to a full poll
        # interval after it.
        time.sleep(min(poll_interval, remaining).total_seconds())
409+
410+
411+
def _find_matching_instance(cache_key):
    """Look up a running TensorBoard instance with the given cache key.

    Among all instances reported by `get_all` whose `cache_key` equals the
    argument, the one with the lowest port is chosen.

    Args:
      cache_key: The cache key to compare against each instance's
        `cache_key` attribute.

    Returns:
      A `TensorboardInfo` object, or `None` if none matches the cache key.
    """
    matching = [info for info in get_all() if info.cache_key == cache_key]
    if not matching:
        return None
    # TODO(@wchargin): Check here that the provided port is still live.
    return min(matching, key=lambda info: info.port)
423+
424+
425+
def _maybe_read_file(filename):
426+
"""Read the given file, if it exists.
427+
428+
Args:
429+
filename: A path to a file.
430+
431+
Returns:
432+
A string containing the file contents, or `None` if the file does
433+
not exist.
434+
"""
435+
try:
436+
with open(filename) as infile:
437+
return infile.read()
438+
except IOError as e:
439+
if e.errno == errno.ENOENT:
440+
return None

0 commit comments

Comments
 (0)