From 5b33c013d359b43d7bf951028f7e6daea415f7f5 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Fri, 2 Feb 2024 11:28:54 +0100 Subject: [PATCH 1/6] arbiter: Handle SIGCHLD in normal/main process context ... as opposed to in signal context. This is beneficial, since it means that we can, in a signal safe way, print messages about why e.g. a worker stopped its execution. And since handle_sigchld() logs what it does anyway, don't bother printing out that we're handling SIGCHLD. If workers are killed at rapid pace, we won't get as many SIGCHLD as there are workers killed anyway. --- gunicorn/arbiter.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index 1eaf453d5..465b9db11 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -40,12 +40,9 @@ class Arbiter: # I love dynamic languages SIG_QUEUE = [] - SIGNALS = [getattr(signal, "SIG%s" % x) - for x in "HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()] - SIG_NAMES = dict( - (getattr(signal, name), name[3:].lower()) for name in dir(signal) - if name[:3] == "SIG" and name[3] != "_" - ) + SIGNALS = [getattr(signal.Signals, "SIG%s" % x) + for x in "CHLD HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()] + SIG_NAMES = dict((sig, sig.name[3:].lower()) for sig in SIGNALS) def __init__(self, app): os.environ["SERVER_SOFTWARE"] = SERVER_SOFTWARE @@ -185,7 +182,6 @@ def init_signals(self): # initialize all signals for s in self.SIGNALS: signal.signal(s, self.signal) - signal.signal(signal.SIGCHLD, self.handle_chld) def signal(self, sig, frame): if len(self.SIG_QUEUE) < 5: @@ -219,7 +215,8 @@ def run(self): if not handler: self.log.error("Unhandled signal: %s", signame) continue - self.log.info("Handling signal: %s", signame) + if sig != signal.SIGCHLD: + self.log.info("Handling signal: %s", signame) handler() self.wakeup() except (StopIteration, KeyboardInterrupt): @@ -236,10 +233,9 @@ def run(self): self.pidfile.unlink() sys.exit(-1) - def handle_chld(self, sig, frame): + def handle_chld(self): "SIGCHLD handling" self.reap_workers() - self.wakeup() def handle_hup(self): """\ @@ -391,7 +387,10 @@ def stop(self, graceful=True): # instruct the workers to exit self.kill_workers(sig) # wait until the graceful timeout - while self.WORKERS and time.time() < limit: + while True: + self.reap_workers() + if not self.WORKERS or time.time() >= limit: + break time.sleep(0.1) self.kill_workers(signal.SIGKILL) From d653ebc07cd8e633444f067170f755c4af6edfb1 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Fri, 2 Feb 2024 12:36:36 +0100 Subject: [PATCH 2/6] arbiter: Remove PIPE and only use SIG_QUEUE instead Since we can use something from queue.*, we can make it blocking as well, removing the need for two different data structures. --- gunicorn/arbiter.py | 90 +++++++++++---------------------------------- 1 file changed, 22 insertions(+), 68 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index 465b9db11..3aa31c980 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -4,11 +4,11 @@ import errno import os import random -import select import signal import sys import time import traceback +import queue from gunicorn.errors import HaltServer, AppImportError from gunicorn.pidfile import Pidfile @@ -36,10 +36,9 @@ class Arbiter: LISTENERS = [] WORKERS = {} - PIPE = [] # I love dynamic languages - SIG_QUEUE = [] + SIG_QUEUE = queue.SimpleQueue() SIGNALS = [getattr(signal.Signals, "SIG%s" % x) for x in "CHLD HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()] SIG_NAMES = dict((sig, sig.name[3:].lower()) for sig in SIGNALS) @@ -167,16 +166,6 @@ def init_signals(self): Initialize master signal handling. Most of the signals are queued. Child signals only wake up the master. """ - # close old PIPE - for p in self.PIPE: - os.close(p) - - # initialize the pipe - self.PIPE = pair = os.pipe() - for p in pair: - util.set_non_blocking(p) - util.close_on_exec(p) - self.log.close_on_exec() # initialize all signals @@ -184,9 +173,8 @@ def init_signals(self): signal.signal(s, self.signal) def signal(self, sig, frame): - if len(self.SIG_QUEUE) < 5: - self.SIG_QUEUE.append(sig) - self.wakeup() + """ Note: Signal handler! No logging allowed. """ + self.SIG_QUEUE.put(sig) def run(self): "Main master loop." @@ -199,26 +187,23 @@ def run(self): while True: self.maybe_promote_master() - sig = self.SIG_QUEUE.pop(0) if self.SIG_QUEUE else None - if sig is None: - self.sleep() - self.murder_workers() - self.manage_workers() - continue - - if sig not in self.SIG_NAMES: - self.log.info("Ignoring unknown signal: %s", sig) - continue + try: + sig = self.SIG_QUEUE.get(timeout=1) + except queue.Empty: + sig = None + + if sig: + signame = self.SIG_NAMES.get(sig) + handler = getattr(self, "handle_%s" % signame, None) + if not handler: + self.log.error("Unhandled signal: %s", signame) + continue + if sig != signal.SIGCHLD: + self.log.info("Handling signal: %s", signame) + handler() - signame = self.SIG_NAMES.get(sig) - handler = getattr(self, "handle_%s" % signame, None) - if not handler: - self.log.error("Unhandled signal: %s", signame) - continue - if sig != signal.SIGCHLD: - self.log.info("Handling signal: %s", signame) - handler() - self.wakeup() + self.murder_workers() + self.manage_workers() except (StopIteration, KeyboardInterrupt): self.halt() except HaltServer as inst: @@ -322,16 +307,6 @@ def maybe_promote_master(self): # reset proctitle util._setproctitle("master [%s]" % self.proc_name) - def wakeup(self): - """\ - Wake up the arbiter by writing to the PIPE - """ - try: - os.write(self.PIPE[1], b'.') - except OSError as e: - if e.errno not in [errno.EAGAIN, errno.EINTR]: - raise - def halt(self, reason=None, exit_status=0): """ halt arbiter """ self.stop() @@ -346,25 +321,6 @@ def halt(self, reason=None, exit_status=0): self.cfg.on_exit(self) sys.exit(exit_status) - def sleep(self): - """\ - Sleep until PIPE is readable or we timeout. - A readable PIPE means a signal occurred. - """ - try: - ready = select.select([self.PIPE[0]], [], [], 1.0) - if not ready[0]: - return - while os.read(self.PIPE[0], 1): - pass - except OSError as e: - # TODO: select.error is a subclass of OSError since Python 3.3. - error_number = getattr(e, 'errno', e.args[0]) - if error_number not in [errno.EAGAIN, errno.EINTR]: - raise - except KeyboardInterrupt: - sys.exit() - def stop(self, graceful=True): """\ Stop workers @@ -387,11 +343,9 @@ def stop(self, graceful=True): # instruct the workers to exit self.kill_workers(sig) # wait until the graceful timeout - while True: - self.reap_workers() - if not self.WORKERS or time.time() >= limit: - break + while self.WORKERS and time.time() < limit: time.sleep(0.1) + self.reap_workers() self.kill_workers(signal.SIGKILL) From 052448a64fb6f4cd54e757647e958ff889554211 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Thu, 1 Feb 2024 22:19:05 +0100 Subject: [PATCH 3/6] arbiter: Use waitpid() facilities to handle worker exit status This change is meant to handle the return value of waitpid() in a way that is more in line with the man page of said syscall. The changes can be summarized as follows: * Use os.WIFEXITED and os.WIFSIGNALED to determine what caused waitpid() to return, and exactly how a worker may have exited. * In case of normal termination, use os.WEXITSTATUS() to read the exit status (instead of using a hand rolled bit shift). A redundant log was removed in this code path. * In case of termination by a signal, use os.WTERMSIG() to determine the signal which caused the worker to terminate. This was buggy before, since the WCOREFLAG (0x80) could cause e.g. a SIGSEGV (code 11) to be reported as "code 139", meaning "code (0x80 | 11)". * Since waitpid() isn't called with WSTOPPED nor WCONTINUED, there's no need to have any os.WIFSTOPPED or os.WIFCONTINUED handling. --- gunicorn/arbiter.py | 58 ++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index 3aa31c980..b4fc1808d 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -470,44 +470,38 @@ def reap_workers(self): break if self.reexec_pid == wpid: self.reexec_pid = 0 - else: - # A worker was terminated. If the termination reason was - # that it could not boot, we'll shut it down to avoid - # infinite start/stop cycles. - exitcode = status >> 8 - if exitcode != 0: - self.log.error('Worker (pid:%s) exited with code %s', wpid, exitcode) + continue + + if os.WIFEXITED(status): + # A worker was normally terminated. If the termination + # reason was that it could not boot, we'll halt the server + # to avoid infinite start/stop cycles. + exitcode = os.WEXITSTATUS(status) + log = self.log.error if exitcode != 0 else self.log.debug + log('Worker (pid:%s) exited with code %s', wpid, exitcode) if exitcode == self.WORKER_BOOT_ERROR: reason = "Worker failed to boot." raise HaltServer(reason, self.WORKER_BOOT_ERROR) if exitcode == self.APP_LOAD_ERROR: reason = "App failed to load." raise HaltServer(reason, self.APP_LOAD_ERROR) - - if exitcode > 0: - # If the exit code of the worker is greater than 0, - # let the user know. - self.log.error("Worker (pid:%s) exited with code %s.", - wpid, exitcode) - elif status > 0: - # If the exit code of the worker is 0 and the status - # is greater than 0, then it was most likely killed - # via a signal. - try: - sig_name = signal.Signals(status).name - except ValueError: - sig_name = "code {}".format(status) - msg = "Worker (pid:{}) was sent {}!".format( - wpid, sig_name) - - # Additional hint for SIGKILL - if status == signal.SIGKILL: - msg += " Perhaps out of memory?" - self.log.error(msg) - - worker = self.WORKERS.pop(wpid, None) - if not worker: - continue + elif os.WIFSIGNALED(status): + # A worker was terminated by a signal. + sig = os.WTERMSIG(status) + try: + sig_name = signal.Signals(sig).name + except ValueError: + sig_name = "signal {}".format(sig) + msg = "Worker (pid:{}) was terminated by {}!".format( + wpid, sig_name) + + # Additional hint for SIGKILL + if sig == signal.SIGKILL: + msg += " Perhaps out of memory?" + self.log.error(msg) + + worker = self.WORKERS.pop(wpid, None) + if worker: worker.tmp.close() self.cfg.child_exit(self, worker) except OSError as e: From b3db5b90a23806ac413dec800f3c27e22c8fe848 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Sat, 3 Feb 2024 17:08:16 +0100 Subject: [PATCH 4/6] arbiter: Reinstall SIGCHLD as required by some UNIXes According to the python signal documentation[1], SIGCHLD is handled differently from other signals. Specifically, if the underlying implementation resets the SIGCHLD signal handler, then python won't reinstall it (as it does for other signals). This behavior doesn't seem to exist for neither Linux nor Mac, but perhaps one could argue that it's good practise anyway. [1] https://docs.python.org/3/library/signal.html --- gunicorn/arbiter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index b4fc1808d..efc6769ba 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -176,6 +176,10 @@ def signal(self, sig, frame): """ Note: Signal handler! No logging allowed. """ self.SIG_QUEUE.put(sig) + # Some UNIXes require SIGCHLD to be reinstalled, see python signal docs + if sig == signal.SIGCHLD: + signal.signal(sig, self.signal) + def run(self): "Main master loop." self.start() From 64387d1715d9da1c19d427e51834df2dfdd52201 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Sun, 4 Feb 2024 23:06:54 +0100 Subject: [PATCH 5/6] arbiter: clean up main loop * Look up handlers in __init__() to induce run-time error early on if something is wrong. * Since we now know that all handlers exist, we can simplify the main loop in arbiter, in such a way that we don't need to call wakeup(). So after this commit, the pipe in arbiter is only used to deliver which signal was sent. --- gunicorn/arbiter.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index efc6769ba..256cba66f 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -41,7 +41,6 @@ class Arbiter: SIG_QUEUE = queue.SimpleQueue() SIGNALS = [getattr(signal.Signals, "SIG%s" % x) for x in "CHLD HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()] - SIG_NAMES = dict((sig, sig.name[3:].lower()) for sig in SIGNALS) def __init__(self, app): os.environ["SERVER_SOFTWARE"] = SERVER_SOFTWARE @@ -71,6 +70,11 @@ def __init__(self, app): 0: sys.executable } + self.SIG_HANDLERS = dict( + (sig, getattr(self, "handle_%s" % sig.name[3:].lower())) + for sig in self.SIGNALS + ) + def _get_num_workers(self): return self._num_workers @@ -193,18 +197,11 @@ def run(self): try: sig = self.SIG_QUEUE.get(timeout=1) - except queue.Empty: - sig = None - - if sig: - signame = self.SIG_NAMES.get(sig) - handler = getattr(self, "handle_%s" % signame, None) - if not handler: - self.log.error("Unhandled signal: %s", signame) - continue if sig != signal.SIGCHLD: - self.log.info("Handling signal: %s", signame) - handler() + self.log.info("Handling signal: %s", signal.Signals(sig).name) + self.SIG_HANDLERS[sig]() + except queue.Empty: + pass self.murder_workers() self.manage_workers() From 7ecea2d54437ce9025aacd7f52a361d96cab4125 Mon Sep 17 00:00:00 2001 From: Richard Eklycke Date: Sun, 18 Aug 2024 12:26:37 +0200 Subject: [PATCH 6/6] arbiter: Add Arbiter:wakeup() method It accepts an optional "due_to_signal" argument which can be used to tell if the wakeup was made because a signal handler needs to be executed or not. --- gunicorn/arbiter.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py index 256cba66f..b918e6e80 100644 --- a/gunicorn/arbiter.py +++ b/gunicorn/arbiter.py @@ -41,6 +41,7 @@ class Arbiter: SIG_QUEUE = queue.SimpleQueue() SIGNALS = [getattr(signal.Signals, "SIG%s" % x) for x in "CHLD HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()] + WAKEUP_REQUEST = signal.NSIG def __init__(self, app): os.environ["SERVER_SOFTWARE"] = SERVER_SOFTWARE @@ -178,12 +179,18 @@ def init_signals(self): def signal(self, sig, frame): """ Note: Signal handler! No logging allowed. """ - self.SIG_QUEUE.put(sig) + self.wakeup(due_to_signal=sig) # Some UNIXes require SIGCHLD to be reinstalled, see python signal docs if sig == signal.SIGCHLD: signal.signal(sig, self.signal) + def wakeup(self, due_to_signal=None): + """\ + Wake up the main master loop. + """ + self.SIG_QUEUE.put(due_to_signal or self.WAKEUP_REQUEST) + def run(self): "Main master loop." self.start() @@ -197,9 +204,10 @@ def run(self): try: sig = self.SIG_QUEUE.get(timeout=1) - if sig != signal.SIGCHLD: - self.log.info("Handling signal: %s", signal.Signals(sig).name) - self.SIG_HANDLERS[sig]() + if sig != self.WAKEUP_REQUEST: + if sig != signal.SIGCHLD: + self.log.info("Handling signal: %s", signal.Signals(sig).name) + self.SIG_HANDLERS[sig]() except queue.Empty: pass