From 0af511557d81f6424d75f88a459c172a8174f5b4 Mon Sep 17 00:00:00 2001
From: Richard Eklycke <richard@eklycke.se>
Date: Thu, 1 Feb 2024 22:19:05 +0100
Subject: [PATCH] arbiter: Use waitpid() facilities to handle worker exit
 status

This change is meant to handle the return value of waitpid() in a way
that is more in line with the man page of said syscall. The changes
can be summarized as follows:

* Use os.WIFEXITED and os.WIFSIGNALED to determine what caused
  waitpid() to return, and exactly how a worker may have exited.

* In case of normal termination, use os.WEXITSTATUS() to read the exit
  status (instead of using a hand rolled bit shift). A redundant log
  was removed in this code path.

* In case of termination by a signal, use os.WTERMSIG() to determine
  the signal which caused the worker to terminate. This was buggy
  before, since the WCOREFLAG (0x80) could cause e.g. a SIGSEGV (code
  11) to be reported as "code 139", meaning "code (0x80 | 11)".

* Since waitpid() isn't called with WSTOPPED nor WCONTINUED, there's
  no need to have any os.WIFSTOPPED or os.WIFCONTINUED handling.
---
 gunicorn/arbiter.py | 62 +++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/gunicorn/arbiter.py b/gunicorn/arbiter.py
index da94b4360..a9724cc6c 100644
--- a/gunicorn/arbiter.py
+++ b/gunicorn/arbiter.py
@@ -522,46 +522,42 @@ def reap_workers(self):
                     break
                 if self.reexec_pid == wpid:
                     self.reexec_pid = 0
-                else:
-                    # A worker was terminated. If the termination reason was
-                    # that it could not boot, we'll shut it down to avoid
-                    # infinite start/stop cycles.
-                    exitcode = status >> 8
+                    continue
+
+                if os.WIFEXITED(status):
+                    # A worker was normally terminated. If the termination
+                    # reason was that it could not boot, we'll halt the server
+                    # to avoid infinite start/stop cycles.
+                    exitcode = os.WEXITSTATUS(status)
                     if exitcode != 0:
-                        self.log.error('Worker (pid:%s) exited with code %s', wpid, exitcode)
+                        self.log.error('Worker (pid:%s) exited with code %s',
+                                       wpid, exitcode)
                     if exitcode == self.WORKER_BOOT_ERROR:
                         reason = "Worker failed to boot."
                         raise HaltServer(reason, self.WORKER_BOOT_ERROR)
                     if exitcode == self.APP_LOAD_ERROR:
                         reason = "App failed to load."
                         raise HaltServer(reason, self.APP_LOAD_ERROR)
-
-                    if exitcode > 0:
-                        # If the exit code of the worker is greater than 0,
-                        # let the user know.
-                        self.log.error("Worker (pid:%s) exited with code %s.",
-                                       wpid, exitcode)
-                    elif status > 0:
-                        # If the exit code of the worker is 0 and the status
-                        # is greater than 0, then it was most likely killed
-                        # via a signal.
-                        try:
-                            sig_name = signal.Signals(status).name
-                        except ValueError:
-                            sig_name = "code {}".format(status)
-                        msg = "Worker (pid:{}) was sent {}!".format(
-                            wpid, sig_name)
-
-                        # Additional hint for SIGKILL
-                        if status == signal.SIGKILL:
-                            msg += " Perhaps out of memory?"
-                        self.log.error(msg)
-
-                    worker = self.WORKERS.pop(wpid, None)
-                    if not worker:
-                        continue
-                    worker.tmp.close()
-                    self.cfg.child_exit(self, worker)
+                elif os.WIFSIGNALED(status):
+                    # A worker was terminated by a signal.
+                    sig = os.WTERMSIG(status)
+                    try:
+                        sig_name = signal.Signals(sig).name
+                    except ValueError:
+                        sig_name = "signal {}".format(sig)
+                    msg = "Worker (pid:{}) was terminated by {}!".format(
+                        wpid, sig_name)
+
+                    # Additional hint for SIGKILL
+                    if sig == signal.SIGKILL:
+                        msg += " Perhaps out of memory?"
+                    self.log.error(msg)
+
+                worker = self.WORKERS.pop(wpid, None)
+                if not worker:
+                    continue
+                worker.tmp.close()
+                self.cfg.child_exit(self, worker)
         except OSError as e:
             if e.errno != errno.ECHILD:
                 raise