From 0af511557d81f6424d75f88a459c172a8174f5b4 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Thu, 1 Feb 2024 22:19:05 +0100 Subject: [PATCH] arbiter: Use waitpid() facilities to handle worker exit status This change is meant to handle the return value of waitpid() in a way that is more in line with the man page of said syscall. The changes can be summarized as follows: * Use os.WIFEXITED and os.WIFSIGNALED to determine what caused waitpid() to return, and exactly how a worker may have exited. * In case of normal termination, use os.WEXITSTATUS() to read the exit status (instead of using a hand rolled bit shift). A redundant log was removed in this code path. * In case of termination by a signal, use os.WTERMSIG() to determine the signal which caused the worker to terminate. This was buggy before, since the WCOREFLAG (0x80) could cause e.g. a SIGSEGV (code 11) to be reported as "code 139", meaning "code (0x80 | 11)". * Since waitpid() isn't called with WSTOPPED nor WCONTINUED, there's no need to have any os.WIFSTOPPED or os.WIFCONTINUED handling. --- gunicorn/arbiter.py | 62 +++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index da94b4360..a9724cc6c 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -522,46 +522,42 @@ def reap_workers(self): break if self.reexec_pid == wpid: self.reexec_pid = 0 - else: - # A worker was terminated. If the termination reason was - # that it could not boot, we'll shut it down to avoid - # infinite start/stop cycles. - exitcode = status >> 8 + continue + + if os.WIFEXITED(status): + # A worker was normally terminated. If the termination + # reason was that it could not boot, we'll halt the server + # to avoid infinite start/stop cycles. + exitcode = os.WEXITSTATUS(status) if exitcode != 0: - self.log.error('Worker (pid:%s) exited with code %s', wpid, exitcode) + self.log.error('Worker (pid:%s) exited with code %s', + wpid, exitcode) if exitcode == self.WORKER_BOOT_ERROR: reason = "Worker failed to boot." raise HaltServer(reason, self.WORKER_BOOT_ERROR) if exitcode == self.APP_LOAD_ERROR: reason = "App failed to load." raise HaltServer(reason, self.APP_LOAD_ERROR) - - if exitcode > 0: - # If the exit code of the worker is greater than 0, - # let the user know. - self.log.error("Worker (pid:%s) exited with code %s.", - wpid, exitcode) - elif status > 0: - # If the exit code of the worker is 0 and the status - # is greater than 0, then it was most likely killed - # via a signal. - try: - sig_name = signal.Signals(status).name - except ValueError: - sig_name = "code {}".format(status) - msg = "Worker (pid:{}) was sent {}!".format( - wpid, sig_name) - - # Additional hint for SIGKILL - if status == signal.SIGKILL: - msg += " Perhaps out of memory?" - self.log.error(msg) - - worker = self.WORKERS.pop(wpid, None) - if not worker: - continue - worker.tmp.close() - self.cfg.child_exit(self, worker) + elif os.WIFSIGNALED(status): + # A worker was terminated by a signal. + sig = os.WTERMSIG(status) + try: + sig_name = signal.Signals(sig).name + except ValueError: + sig_name = "signal {}".format(sig) + msg = "Worker (pid:{}) was terminated by {}!".format( + wpid, sig_name) + + # Additional hint for SIGKILL + if sig == signal.SIGKILL: + msg += " Perhaps out of memory?" + self.log.error(msg) + + worker = self.WORKERS.pop(wpid, None) + if not worker: + continue + worker.tmp.close() + self.cfg.child_exit(self, worker) except OSError as e: if e.errno != errno.ECHILD: raise