-
-
Notifications
You must be signed in to change notification settings - Fork 726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ensure Nanny doesn't restart workers that fail to start, and joins subprocess #6427
Changes from 24 commits
7d73e26
ba57395
0676b3b
2861e56
5e1d2ea
ea544e5
3405583
4d61b38
31db02a
de4daaf
babfd00
302d59e
00e277e
ee343aa
bdb632a
27e29b9
cd1e235
fa50856
8ed0062
11aaca9
68ec8df
8decd1a
bb9a35b
8dc579e
6dd8215
9e8e538
3dbebad
c976317
1d8b9e4
28c91a5
88abaf2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -405,7 +405,7 @@ async def instantiate(self) -> Status: | |
self.process = WorkerProcess( | ||
worker_kwargs=worker_kwargs, | ||
silence_logs=self.silence_logs, | ||
on_exit=self._on_exit_sync, | ||
on_exit=self._on_worker_exit_sync, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. driveby: renamed for clarity |
||
worker=self.Worker, | ||
env=self.env, | ||
config=self.config, | ||
|
@@ -497,16 +497,17 @@ def is_alive(self): | |
def run(self, comm, *args, **kwargs): | ||
return run(self, comm, *args, **kwargs) | ||
|
||
def _on_exit_sync(self, exitcode): | ||
self.loop.add_callback(self._on_exit, exitcode) | ||
def _on_worker_exit_sync(self, exitcode): | ||
self.loop.add_callback(self._on_worker_exit, exitcode) | ||
|
||
@log_errors | ||
async def _on_exit(self, exitcode): | ||
async def _on_worker_exit(self, exitcode): | ||
if self.status not in ( | ||
Status.init, | ||
Status.closing, | ||
Status.closed, | ||
Status.closing_gracefully, | ||
Status.failed, | ||
): | ||
try: | ||
await self._unregister() | ||
|
@@ -521,6 +522,7 @@ async def _on_exit(self, exitcode): | |
Status.closing, | ||
Status.closed, | ||
Status.closing_gracefully, | ||
Status.failed, | ||
): | ||
logger.warning("Restarting worker") | ||
await self.instantiate() | ||
|
@@ -581,7 +583,7 @@ async def close(self, timeout=5): | |
if self.process is not None: | ||
await self.kill(timeout=timeout) | ||
except Exception: | ||
pass | ||
logger.exception("Error in Nanny killing Worker subprocess") | ||
self.process = None | ||
await self.rpc.close() | ||
self.status = Status.closed | ||
|
@@ -666,15 +668,15 @@ async def start(self) -> Status: | |
await self.process.start() | ||
except OSError: | ||
logger.exception("Nanny failed to start process", exc_info=True) | ||
self.process.terminate() | ||
# NOTE: doesn't wait for process to terminate, just for terminate signal to be sent | ||
await self.process.terminate() | ||
self.status = Status.failed | ||
return self.status | ||
try: | ||
msg = await self._wait_until_connected(uid) | ||
except Exception: | ||
logger.exception("Failed to connect to process") | ||
# NOTE: doesn't wait for process to terminate, just for terminate signal to be sent | ||
await self.process.terminate() | ||
self.status = Status.failed | ||
self.process.terminate() | ||
raise | ||
if not msg: | ||
return self.status | ||
|
@@ -735,7 +737,12 @@ def mark_stopped(self): | |
async def kill(self, timeout: float = 2, executor_wait: bool = True): | ||
""" | ||
Ensure the worker process is stopped, waiting at most | ||
*timeout* seconds before terminating it abruptly. | ||
``timeout * 0.8`` seconds before killing it abruptly. | ||
|
||
When `kill` returns, the worker process has been joined. | ||
|
||
If the worker process does not terminate within ``timeout`` seconds, | ||
even after being killed, `asyncio.TimeoutError` is raised. | ||
""" | ||
deadline = time() + timeout | ||
|
||
|
@@ -744,32 +751,38 @@ async def kill(self, timeout: float = 2, executor_wait: bool = True): | |
if self.status == Status.stopping: | ||
await self.stopped.wait() | ||
return | ||
assert self.status in (Status.starting, Status.running) | ||
assert self.status in ( | ||
Status.starting, | ||
Status.running, | ||
Status.failed, # process failed to start, but hasn't been joined yet | ||
), self.status | ||
self.status = Status.stopping | ||
logger.info("Nanny asking worker to close") | ||
|
||
process = self.process | ||
assert self.process | ||
wait_timeout = timeout * 0.8 | ||
self.child_stop_q.put( | ||
{ | ||
"op": "stop", | ||
"timeout": max(0, deadline - time()) * 0.8, | ||
"timeout": wait_timeout, | ||
"executor_wait": executor_wait, | ||
} | ||
) | ||
await asyncio.sleep(0) # otherwise we get broken pipe errors | ||
self.child_stop_q.close() | ||
|
||
while process.is_alive() and time() < deadline: | ||
await asyncio.sleep(0.05) | ||
try: | ||
await process.join(wait_timeout) | ||
return | ||
except asyncio.TimeoutError: | ||
pass | ||
|
||
if process.is_alive(): | ||
logger.warning( | ||
f"Worker process still alive after {timeout} seconds, killing" | ||
) | ||
try: | ||
await process.terminate() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Change: we used to first call […]. Now, we still do […] |
||
except Exception as e: | ||
logger.error("Failed to kill worker process: %s", e) | ||
logger.warning( | ||
f"Worker process still alive after {wait_timeout} seconds, killing" | ||
) | ||
await process.kill() | ||
await process.join(max(0, deadline - time())) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm a little wary of this deadline on the join. I could imagine the default 2s timeout not being long enough for the process to actually shut down in CI. I think we should probably make the default timeout a bit longer. |
||
|
||
async def _wait_until_connected(self, uid): | ||
while True: | ||
|
@@ -787,9 +800,6 @@ async def _wait_until_connected(self, uid): | |
continue | ||
|
||
if "exception" in msg: | ||
logger.error( | ||
"Failed while trying to start worker process: %s", msg["exception"] | ||
) | ||
raise msg["exception"] | ||
else: | ||
return msg | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
driveby: added so nobody in the future can try to override this