edgi-govdata-archiving · Mr0grog · Jan 1, 2021 · Dec 14, 2020 · Dec 14, 2020 · Dec 14, 2020
diff --git a/.env.example b/.env.example
@@ -1,22 +1,15 @@
-# All environmental variables are optional. The associated settings can be
-# updated at runtime. These merely provide defaults at import time.
+# All environment variables are optional. The associated settings can be
+# updated at runtime. These provide a friendly interface to changing settings
+# when running commands from a CLI.
 
+# Diff-Related variables --------------------------
 
-# These are used in the web_monitoring.db module.
-# They can point to a local deployment of web-monitoring-db (as in this
-# example) or to the production deployment.
-export WEB_MONITORING_DB_URL="http://localhost:3000"
-export WEB_MONITORING_DB_EMAIL="seed-admin@example.com"
-export WEB_MONITORING_DB_PASSWORD="PASSWORD"
-export WEB_MONITORING_APP_ENV="development"  # or production or test
-
-# These are used in test_html_diff
-export WEB_MONITORING_DB_STAGING_URL="https://api-staging.monitoring.envirodatagov.org"
-export WEB_MONITORING_DB_STAGING_EMAIL=""
-export WEB_MONITORING_DB_STAGING_PASSWORD=""
+# These CSS color values are used to set the colors in html_diff_render, differs and links_diff
+# export DIFFER_COLOR_INSERTION="#4dac26"
+# export DIFFER_COLOR_DELETION="#d01c8b"
 
 
-# Diff-Related variables --------------------------
+# Server-Related variables ------------------------
 
 # Set the diffing server to debug mode. Returns tracebacks in error responses
 # and auto-reloads the server when source files change.
@@ -38,13 +31,9 @@ export DIFFER_MAX_BODY_SIZE='10485760' # 10 MB
 # https:// requests have invalid certificates.
 # export VALIDATE_TARGET_CERTIFICATES="false"
 
-# These CSS color values are used to set the colors in html_diff_render, differs and links_diff
-# export DIFFER_COLOR_INSERTION="#4dac26"
-# export DIFFER_COLOR_DELETION="#d01c8b"
-
 # Set how many diffs can be run in parallel.
 # export DIFFER_PARALLELISM=10
 
-# Uncomment to enable logging. Set the level as any normal level.
-# https://docs.python.org/3.6/library/logging.html#logging-levels
-# export LOG_LEVEL=INFO
+# Instead of crashing when the process pool used for running diffs breaks,
+# keep accepting requests and try to restart the pool.
+# RESTART_BROKEN_DIFFER='true'
diff --git a/docs/source/release-history.rst b/docs/source/release-history.rst
@@ -2,6 +2,12 @@
 Release History
 ===============
 
+In Development
+--------------
+
+- The server uses a pool of child processes to run diffs. If the pool breaks while running a diff, it will be re-created once, and, if it fails again, the server will now crash with an exit code of ``10``. (An external process manager like Supervisor, Kubernetes, etc. can then decide how to handle the situation.) Previously, the diff would fail at this point, but server would try to re-create the process pool again the next time a diff was requested. You can opt-in to the old behavior by setting the ``RESTART_BROKEN_DIFFER`` environment variable to ``true``. (`#49 <https://github.com/edgi-govdata-archiving/web-monitoring-diff/issues/49>`_)
+
+
 Version 0.1.1 (2020-11-24)
 --------------------------
 

diff --git a/web_monitoring_diff/server/server.py b/web_monitoring_diff/server/server.py
@@ -36,6 +36,7 @@
 sentry_sdk.integrations.logging.ignore_logger('tornado.access')
 
 DIFFER_PARALLELISM = int(os.environ.get('DIFFER_PARALLELISM', 10))
+RESTART_BROKEN_DIFFER = os.environ.get('RESTART_BROKEN_DIFFER', 'False').strip().lower() == 'true'
 
 # Map tokens in the REST API to functions in modules.
 # The modules do not have to be part of the web_monitoring_diff package.
@@ -208,6 +209,7 @@ def initialize_diff_worker():
 
 class DiffServer(tornado.web.Application):
     terminating = False
+    server = None
 
     def listen(self, port, address='', **kwargs):
         self.server = super().listen(port, address, **kwargs)
@@ -220,9 +222,11 @@ async def shutdown(self, immediate=False):
         Otherwise, diffs are allowed to try and finish.
         """
         self.terminating = True
-        self.server.stop()
+        if self.server:
+            self.server.stop()
         await self.shutdown_differs(immediate)
-        await self.server.close_all_connections()
+        if self.server:
+            await self.server.close_all_connections()
 
     async def shutdown_differs(self, immediate=False):
         """Stop all child processes used for running diffs."""
@@ -238,6 +242,12 @@ async def shutdown_differs(self, immediate=False):
             else:
                 await shutdown_executor_in_loop(differs)
 
+    async def quit(self, immediate=False, code=0):
+        await self.shutdown(immediate=immediate)
+        tornado.ioloop.IOLoop.current().stop()
+        if code:
+            sys.exit(code)
+
     def handle_signal(self, signal_type, frame):
         """Handle a signal by shutting down the application and IO loop."""
         loop = tornado.ioloop.IOLoop.current()
@@ -515,6 +525,16 @@ async def diff(self, func, a, b, params, tries=2):
                     if executor == old_executor:
                         executor = self.get_diff_executor(reset=True)
                 else:
+                    # If we shouldn't allow the server to keep rebuilding the
+                    # differ pool for new requests, schedule a shutdown.
+                    # (*Schuduled* so that current requests have a chance to
+                    # complete with an error.)
+                    if not RESTART_BROKEN_DIFFER:
+                        logger.error('Process pool for diffing has failed too '
+                                     'many times; quitting server...')
+                        tornado.ioloop.IOLoop.current().add_callback(
+                            self.application.quit,
+                            code=10)
                     raise
 
     # NOTE: this doesn't do anything async, but if we change it to do so, we

diff --git a/web_monitoring_diff/tests/test_server_exc_handling.py b/web_monitoring_diff/tests/test_server_exc_handling.py
@@ -393,6 +393,9 @@ class BrokenProcessPoolExecutor(concurrent.futures.Executor):
     "Fake process pool that only raises BrokenProcessPool exceptions."
     submit_count = 0
 
+    def __init__(max_workers=None, mp_context=None, initializer=None, initargs=()):
+        return super().__init__()
+
     def submit(self, fn, *args, **kwargs):
         self.submit_count += 1
         result = concurrent.futures.Future()
@@ -403,10 +406,10 @@ def submit(self, fn, *args, **kwargs):
 
 
 class ExecutionPoolTestCase(DiffingServerTestCase):
-    def fetch_async(self, path, **kwargs):
+    def fetch_async(self, path, raise_error=False, **kwargs):
         "Like AyncHTTPTestCase.fetch, but async."
         url = self.get_url(path)
-        return self.http_client.fetch(url, **kwargs)
+        return self.http_client.fetch(url, raise_error=raise_error, **kwargs)
 
     def test_rebuilds_process_pool_when_broken(self):
         # Get a custom executor that will always fail the first time, but get
@@ -427,7 +430,8 @@ def get_executor(self, reset=False):
             assert response.code == 200
             assert did_get_executor == True
 
-    def test_diff_returns_error_if_process_pool_repeatedly_breaks(self):
+    @patch.object(df.DiffServer, "quit")
+    def test_diff_returns_error_if_process_pool_repeatedly_breaks(self, _):
         # Set a custom executor that will always fail.
         def get_executor(self, reset=False):
             return BrokenProcessPoolExecutor()
@@ -474,6 +478,46 @@ def get_executor(self, reset=False):
             # have one reset because only one request hit the bad executor.
             assert bad_executor.submit_count == 2
 
+    # NOTE: the real `quit` tears up the server, which causes porblems with
+    # the test, so we just mock it and test that it was called, rather than
+    # checking whether `sys.exit` was ultimately called. Not totally ideal,
+    # but better than no testing.
+    @patch.object(df.DiffServer, "quit")
+    def test_server_exits_if_process_pool_repeatedly_breaks(self, mock_quit):
+        # Set a custom executor that will always fail.
+        def get_executor(self, reset=False):
+            return BrokenProcessPoolExecutor()
+
+        with patch.object(df.DiffHandler, 'get_diff_executor', get_executor):
+            response = self.fetch('/html_source_dmp?format=json&'
+                                  f'a=file://{fixture_path("empty.txt")}&'
+                                  f'b=file://{fixture_path("empty.txt")}')
+            self.json_check(response)
+            assert response.code == 500
+
+        assert mock_quit.called
+        assert mock_quit.call_args == ({'code': 10},)
+
+    # NOTE: the real `quit` tears up the server, which causes porblems with
+    # the test, so we just mock it and test that it was called, rather than
+    # checking whether `sys.exit` was ultimately called. Not totally ideal,
+    # but better than no testing.
+    @patch.object(df.DiffServer, "quit")
+    @patch('web_monitoring_diff.server.server.RESTART_BROKEN_DIFFER', True)
+    def test_server_does_not_exit_if_env_var_set_when_process_pool_repeatedly_breaks(self, mock_quit):
+        # Set a custom executor that will always fail.
+        def get_executor(self, reset=False):
+            return BrokenProcessPoolExecutor()
+
+        with patch.object(df.DiffHandler, 'get_diff_executor', get_executor):
+            response = self.fetch('/html_source_dmp?format=json&'
+                                  f'a=file://{fixture_path("empty.txt")}&'
+                                  f'b=file://{fixture_path("empty.txt")}')
+            self.json_check(response)
+            assert response.code == 500
+
+        assert not mock_quit.called
+
 
 def mock_diffing_method(c_body):
     return