From 845f97ef1690697b015959b930aa5e8a8a93df40 Mon Sep 17 00:00:00 2001 From: Steve Jalim Date: Fri, 22 Nov 2024 10:20:14 +0000 Subject: [PATCH] Add a cache-baced 'lookahead' to know whether it's worth hitting the CMS for a page ...because if the page isn't in the lookahead, we can avoid touching the DB just to ultimately return a 404 --- bedrock/cms/apps.py | 16 ++++ bedrock/cms/bedrock_wagtail_urls.py | 34 ++++++++ bedrock/cms/decorators.py | 30 +++++++ bedrock/cms/signal_handlers.py | 99 +++++++++++++++++++++ bedrock/cms/tests/conftest.py | 25 ++++++ bedrock/cms/tests/test_decorators.py | 58 ++++++++++++- bedrock/cms/tests/test_utils.py | 124 ++++++++++++++++++++++++++- bedrock/cms/utils.py | 60 +++++++++++++ bedrock/settings/base.py | 2 + bedrock/urls.py | 7 +- 10 files changed, 450 insertions(+), 5 deletions(-) create mode 100644 bedrock/cms/bedrock_wagtail_urls.py create mode 100644 bedrock/cms/signal_handlers.py diff --git a/bedrock/cms/apps.py b/bedrock/cms/apps.py index a1f57a449c1..7f696ab955a 100644 --- a/bedrock/cms/apps.py +++ b/bedrock/cms/apps.py @@ -3,8 +3,24 @@ # file, You can obtain one at https://mozilla.org/MPL/2.0/. from django.apps import AppConfig +from django.db import connection +from django.db.models.signals import post_migrate class CmsConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "bedrock.cms" + + def ready(self): + import bedrock.cms.signal_handlers # noqa: F401 + from bedrock.cms.utils import warm_page_path_cache + + if "wagtailcore_page" in connection.introspection.table_names(): + # The route to take if the DB already exists in a viable state + warm_page_path_cache() + else: + # The route to take the DB isn't ready yet (eg tests or an empty DB) + post_migrate.connect( + bedrock.cms.signal_handlers.trigger_cache_warming, + sender=self, + ) diff --git a/bedrock/cms/bedrock_wagtail_urls.py b/bedrock/cms/bedrock_wagtail_urls.py new file mode 100644 index 00000000000..e1bcd228859 --- /dev/null +++ b/bedrock/cms/bedrock_wagtail_urls.py @@ -0,0 +1,34 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +# Custom version of wagtail_urls that wraps the wagtail_serve route +# with a decorator that does a lookahead to see if Wagtail will 404 or not +# (based on a precomputed cache of URLs in the CMS) + +from django.urls import re_path + +from wagtail.urls import urlpatterns as wagtail_urlpatterns +from wagtail.views import serve as wagtail_serve + +from bedrock.cms.decorators import pre_check_for_cms_404 + +# Modify the wagtail_urlpatterns to replace `wagtail_serve` with a decorated +# version of the same view, so we can pre-empt Wagtail looking up a page +# that we know will be a 404 +custom_wagtail_urls = [] + +for pattern in wagtail_urlpatterns: + if hasattr(pattern.callback, "__name__") and pattern.callback.__name__ == "serve": + custom_wagtail_urls.append( + re_path( + # This is a RegexPattern not a RoutePattern, which is why we use re_path not path + pattern.pattern, + pre_check_for_cms_404(wagtail_serve), + name=pattern.name, + ) + ) + else: + custom_wagtail_urls.append(pattern) + +# Note: custom_wagtail_urls is imported into the main project urls.py instead of wagtail_urls diff --git a/bedrock/cms/decorators.py b/bedrock/cms/decorators.py index 7321795507a..04b4ff6aef0 100644 --- a/bedrock/cms/decorators.py +++ b/bedrock/cms/decorators.py @@ -2,17 +2,23 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. +import logging from functools import wraps +from django.conf import settings from django.http import Http404 from wagtail.views import serve as wagtail_serve from bedrock.base.i18n import remove_lang_prefix +from bedrock.cms.utils import path_exists_in_cms from lib.l10n_utils.fluent import get_active_locales from .utils import get_cms_locales_for_path +logger = logging.getLogger(__name__) + + HTTP_200_OK = 200 @@ -176,3 +182,27 @@ def wrapped_view(request, *args, **kwargs): else: # Otherwise, apply the decorator directly to view_func return decorator(view_func) + + +def pre_check_for_cms_404(view): + """ + Decorator intended to avoid going through the Wagtail's serve view + for a route that we know will be a 404. How do we know? We have a + pre-warmed cache of all the pages of _live_ pages known to Wagtail + - see bedrock.cms.utils for that. + + This decorator can be skipped if settings.CMS_DO_PAGE_PATH_PRECHECK is + set to False via env vars. + """ + + def wrapped_view(request, *args, **kwargs): + _path_to_check = request.path + if settings.CMS_DO_PAGE_PATH_PRECHECK: + if not path_exists_in_cms(_path_to_check): + logger.info(f"Raising early 404 for {_path_to_check} because it doesn't exist in the CMS") + raise Http404 + + # Proceed to the original view + return view(request, *args, **kwargs) + + return wrapped_view diff --git a/bedrock/cms/signal_handlers.py b/bedrock/cms/signal_handlers.py new file mode 100644 index 00000000000..31792dfb5ee --- /dev/null +++ b/bedrock/cms/signal_handlers.py @@ -0,0 +1,99 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +import logging +from typing import TYPE_CHECKING, Type + +from django.conf import settings +from django.contrib.auth import get_user_model +from django.core.mail import send_mail +from django.template.loader import render_to_string + +from wagtail.signals import page_published, page_unpublished, post_page_move +from wagtail_localize_smartling.signals import translation_imported + +from bedrock.cms.utils import warm_page_path_cache + +if TYPE_CHECKING: + from wagtail_localize.models import Translation + from wagtail_localize_smartling.models import Job + + +logger = logging.getLogger(__name__) + + +def notify_of_imported_translation( + sender: Type["Job"], + instance: "Job", + translation: "Translation", + **kwargs, +): + """ + Signal handler for receiving news that a translation has landed from + Smartling. + + For now, sends a notification email to all Admins + """ + UserModel = get_user_model() + + admin_emails = UserModel.objects.filter( + is_superuser=True, + is_active=True, + ).values_list("email", flat=True) + admin_emails = [email for email in admin_emails if email] # Safety check to ensure no empty email addresses are included + + if not admin_emails: + logger.warning("Unable to send translation-imported email alerts: no admins in system") + return + + email_subject = "New translations imported into Bedrock CMS" + + job_name = instance.name + translation_source_name = str(instance.translation_source.get_source_instance()) + + smartling_cms_dashboard_url = f"{settings.WAGTAILADMIN_BASE_URL}/cms-admin/smartling-jobs/inspect/{instance.pk}/" + + email_body = render_to_string( + template_name="cms/email/notifications/translation_imported__body.txt", + context={ + "job_name": job_name, + "translation_source_name": translation_source_name, + "translation_target_language_code": translation.target_locale.language_code, + "smartling_cms_dashboard_url": smartling_cms_dashboard_url, + }, + ) + + send_mail( + subject=email_subject, + message=email_body, + from_email=settings.DEFAULT_FROM_EMAIL, + recipient_list=admin_emails, + ) + logger.info(f"Translation-imported notification sent to {len(admin_emails)} admins") + + +translation_imported.connect(notify_of_imported_translation, weak=False) + + +def trigger_cache_warming(sender, **kwargs): + # Run after the post-migrate signal is sent for the `cms` app + warm_page_path_cache() + + +def rebuild_path_cache_after_page_move(sender, **kwargs): + # Check if a page has moved up or down within the tree + # (rather than just being reordered). If it has really moved + # then we should update the cache + if kwargs["url_path_before"] == kwargs["url_path_after"]: + # No URLs are changing - nothing to do here! + return + + # The page is moving, so we need to rebuild the entire pre-empting-lookup cache + warm_page_path_cache() + + +post_page_move.connect(rebuild_path_cache_after_page_move) + +page_published.connect(trigger_cache_warming) +page_unpublished.connect(trigger_cache_warming) diff --git a/bedrock/cms/tests/conftest.py b/bedrock/cms/tests/conftest.py index 3142f05febf..ad555673c1f 100644 --- a/bedrock/cms/tests/conftest.py +++ b/bedrock/cms/tests/conftest.py @@ -4,6 +4,7 @@ import pytest import wagtail_factories +from wagtail.contrib.redirects.models import Redirect from wagtail.models import Locale, Site from bedrock.cms.tests.factories import LocaleFactory, SimpleRichTextPageFactory @@ -74,8 +75,14 @@ def tiny_localized_site(): site = Site.objects.get(is_default_site=True) en_us_root_page = site.root_page + fr_root_page = en_us_root_page.copy_for_translation(fr_locale) + rev = fr_root_page.save_revision() + fr_root_page.publish(rev) + pt_br_root_page = en_us_root_page.copy_for_translation(pt_br_locale) + rev = pt_br_root_page.save_revision() + pt_br_root_page.publish(rev) en_us_homepage = SimpleRichTextPageFactory( title="Test Page", @@ -148,3 +155,21 @@ def tiny_localized_site(): assert fr_homepage.live is True assert fr_child.live is True assert fr_grandchild.live is True + + +@pytest.fixture +def tiny_localized_site_redirects(): + """Some test redirects that complement the tiny_localized_site fixture. + + Useful for things like the tests for the cache-based lookup + in bedrock.cms.tests.test_utils.test_path_exists_in_cms + """ + + Redirect.add_redirect( + old_path="/fr/moved-page/", + redirect_to="/fr/test-page/child-page/", + ) + Redirect.add_redirect( + old_path="/en-US/deeper/nested/moved-page/", + redirect_to="/fr/test-page/", + ) diff --git a/bedrock/cms/tests/test_decorators.py b/bedrock/cms/tests/test_decorators.py index 6905ab7bfb9..5cfdaa1da75 100644 --- a/bedrock/cms/tests/test_decorators.py +++ b/bedrock/cms/tests/test_decorators.py @@ -3,13 +3,14 @@ # file, You can obtain one at https://mozilla.org/MPL/2.0/. +from django.http import Http404, HttpResponse from django.urls import path import pytest from wagtail.rich_text import RichText from bedrock.base.i18n import bedrock_i18n_patterns -from bedrock.cms.decorators import prefer_cms +from bedrock.cms.decorators import pre_check_for_cms_404, prefer_cms from bedrock.cms.tests import decorator_test_views from bedrock.urls import urlpatterns as mozorg_urlpatterns @@ -448,3 +449,58 @@ def test_prefer_cms_rejects_invalid_setup(mocker, config, expect_exeption): prefer_cms(view_func=fake_view, **config) else: prefer_cms(view_func=fake_view, **config) + + +def dummy_view(request, *args, **kwargs): + return HttpResponse("Hello, world!") + + +@pytest.mark.parametrize("pretend_that_path_exists", [True, False]) +def test_pre_check_for_cms_404( + pretend_that_path_exists, + mocker, + rf, + settings, + client, + tiny_localized_site, +): + settings.CMS_DO_PAGE_PATH_PRECHECK = True + mocked_view = mocker.spy(dummy_view, "__call__") # Spy on the test view + mocked_path_exists_in_cms = mocker.patch("bedrock.cms.decorators.path_exists_in_cms") + mocked_path_exists_in_cms.return_value = pretend_that_path_exists + + decorated_view = pre_check_for_cms_404(mocked_view) + request = rf.get("/path/is/irrelevant/because/we/are/mocking/path_exists_in_cms") + + if pretend_that_path_exists: + response = decorated_view(request) + # Assert: Verify the original view was called + mocked_view.assert_called_once_with(request) + assert response.content == b"Hello, world!" + else: + with pytest.raises(Http404): # Expect an Http404 since path_exists_in_cms returns False + decorated_view(request) + mocked_view.assert_not_called() + + +@pytest.mark.parametrize("pretend_that_path_exists", [True, False]) +def test_pre_check_for_cms_404__show_can_be_disabled_with_settings( + pretend_that_path_exists, + mocker, + rf, + settings, + client, + tiny_localized_site, +): + settings.CMS_DO_PAGE_PATH_PRECHECK = False + mocked_view = mocker.spy(dummy_view, "__call__") # Spy on the test view + mocked_path_exists_in_cms = mocker.patch("bedrock.cms.decorators.path_exists_in_cms") + mocked_path_exists_in_cms.return_value = pretend_that_path_exists + + decorated_view = pre_check_for_cms_404(mocked_view) + request = rf.get("/path/is/irrelevant/because/we/are/mocking/path_exists_in_cms") + + # The fake view will always be called because the pre-check isn't being used + response = decorated_view(request) + mocked_view.assert_called_once_with(request) + assert response.content == b"Hello, world!" diff --git a/bedrock/cms/tests/test_utils.py b/bedrock/cms/tests/test_utils.py index f925fa3ce2f..69d775692fb 100644 --- a/bedrock/cms/tests/test_utils.py +++ b/bedrock/cms/tests/test_utils.py @@ -3,11 +3,21 @@ # file, You can obtain one at https://mozilla.org/MPL/2.0/. +from django.core.cache import cache + import pytest from wagtail.coreutils import get_dummy_request from wagtail.models import Locale, Page -from bedrock.cms.utils import get_cms_locales_for_path, get_locales_for_cms_page, get_page_for_request +from bedrock.cms.utils import ( + BEDROCK_ALL_CMS_PATHS_CACHE_KEY, + _get_all_cms_paths, + get_cms_locales_for_path, + get_locales_for_cms_page, + get_page_for_request, + path_exists_in_cms, + warm_page_path_cache, +) pytestmark = [pytest.mark.django_db] @@ -142,3 +152,115 @@ def test_get_cms_locales_for_path( if get_page_for_request_should_return_a_page: mock_get_page_for_request.assert_called_once_with(request=request) mock_get_locales_for_cms_page.assert_called_once_with(page=page) + + +@pytest.mark.parametrize( + "path, hit_expected", + ( + # Regular path + ("/en-US/test-page/child-page/", True), + # Checking missing trailing slashes + ("/en-US/test-page/child-page", True), + # Checking querystrings don't fox things + ("/en-US/test-page/child-page/?some=querystring", True), + ("/en-US/test-page/child-page/?some=querystring&and=more-stuff", True), + # Checking deeper in the tree + ("/fr/test-page/child-page/grandchild-page/", True), + ("/fr/test-page/child-page/grandchild-page/great-grandchild/", False), + ("/fr/test-page/child-page/grandchild-page/?testing=yes!", True), + ("/fr/test-page/child-page/grandchild-page/?testing=yes!&other=things", True), + # Pages that would 404 + ("/en-US/test-page/not-child-page/", False), + ("/en-US/test-page/not-child-page", False), + ("/fr/grandchild-page/", False), + ("/en-US/not-a-path", False), + ("/en-US/not-a-path/", False), + ("/en-US/", True), + ("/fr/", True), + ("/pt-BR/", True), + ("/pt-BR/test-page/", True), + # Checking paths for redirects are in the cache too + ("/fr/moved-page", True), + ("/en-US/deeper/nested/moved-page", True), + ("/fr/moved-page/", True), # Trailing slash is not part of the redirect + # Confirm that the CMS admin route is not counted as "existing in the CMS" + # (which also means page previews and draftsharing links are unaffected by this lookup) + ("/cms-admin/", False), + # Confirm that some known-only-to-django URLs are not in the page lookup cache + ("/django-admin/", False), + ("/careers/", False), + ), +) +def test_path_exists_in_cms( + client, + tiny_localized_site, + tiny_localized_site_redirects, + path, + hit_expected, +): + cache.delete(BEDROCK_ALL_CMS_PATHS_CACHE_KEY) + assert path_exists_in_cms(path) == hit_expected + + some_django_served_urls = [ + "/cms-admin/", + "/django-admin/", + "/careers/", + ] + + # Also confirm that what would happen without the lookup is what we expect + if ( + hit_expected is False # These are pages that should 404 + and path not in some_django_served_urls # This is not in the URLs the CMS knows about but + ): + assert client.get(path, follow=True).status_code == 404 + else: + # These are pages that should be serveable by Wagtail in some way + if "moved-page" in path: + # The "moved-page" is a route that's been configured as a Redirect + # so will 301 when we get it + resp = client.get(path) + assert resp.status_code == 301 + assert client.get(resp.headers["location"]).status_code == 200 + else: + # These are regular page serves + assert client.get(path, follow=True).status_code == 200 + + +def test_warm_page_path_cache(mocker): + cache.delete(BEDROCK_ALL_CMS_PATHS_CACHE_KEY) + + mock_get_all_cms_paths = mocker.patch("bedrock.cms.utils._get_all_cms_paths") + expected = set(["this", "is a", "test"]) + mock_get_all_cms_paths.return_value = expected + + assert cache.get(BEDROCK_ALL_CMS_PATHS_CACHE_KEY) is None + + warm_page_path_cache() + assert cache.get(BEDROCK_ALL_CMS_PATHS_CACHE_KEY) is expected + + expected_updated = set(["this", "is also a", "test"]) + mock_get_all_cms_paths.return_value = expected_updated + + warm_page_path_cache() + assert cache.get(BEDROCK_ALL_CMS_PATHS_CACHE_KEY) is expected_updated + + +def test__get_all_cms_paths(client, tiny_localized_site, tiny_localized_site_redirects): + expected = set( + [ + "/en-US/", + "/en-US/test-page/", + "/en-US/test-page/child-page/", + "/fr/", + "/fr/test-page/", + "/fr/test-page/child-page/", + "/fr/test-page/child-page/grandchild-page/", + "/pt-BR/", + "/pt-BR/test-page/", + "/pt-BR/test-page/child-page/", + "/fr/moved-page", # No trailing slashes on redirects + "/en-US/deeper/nested/moved-page", # No trailing slashes on redirects + ] + ) + actual = _get_all_cms_paths() + assert actual == expected diff --git a/bedrock/cms/utils.py b/bedrock/cms/utils.py index eed3b69b408..8c1a4c7d9e8 100644 --- a/bedrock/cms/utils.py +++ b/bedrock/cms/utils.py @@ -1,14 +1,23 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. +import logging + +from django.conf import settings +from django.core.cache import cache from django.core.exceptions import ObjectDoesNotExist from django.db.models import Subquery from django.http import Http404 +from wagtail.contrib.redirects.models import Redirect from wagtail.models import Locale, Page from bedrock.base.i18n import split_path_and_normalize_language +logger = logging.getLogger(__name__) + +BEDROCK_ALL_CMS_PATHS_CACHE_KEY = "bedrock_cms_all_known_live_page_paths" + def get_page_for_request(*, request): """For the given HTTPRequest (and its path) find the corresponding Wagtail @@ -61,3 +70,54 @@ def get_cms_locales_for_path(request): locales = get_locales_for_cms_page(page=page) return locales + + +def _get_all_cms_paths() -> set: + """Fetch all the possible URL paths that are available + in Wagtail, in all locales, for LIVE pages only and for + all Redirects + """ + pages = set([x.url for x in Page.objects.live() if x.url is not None]) + redirects = set([x.old_path for x in Redirect.objects.all()]) + + return pages.union(redirects) + + +def path_exists_in_cms(path: str) -> bool: + """ + Using the paths cached via warm_page_path_cache, return a boolean + simply showing whether or not Wagtail can serve the requested path + (whether as a Page or as a Redirect). + + Avoiding Wagtail-raised 404s is the goal of this function. + We don't care if there'll be a chain of redirects or a slash being + auto-appended - we'll let Django/Wagtail handle that. + """ + + cms_paths = cache.get(BEDROCK_ALL_CMS_PATHS_CACHE_KEY) + if cms_paths is None: + cms_paths = warm_page_path_cache() + if "?" in path: + path = path.partition("?")[0] + + if path in cms_paths: + return True + elif not path.endswith("/") and f"{path}/" in cms_paths: + # pages have trailing slashes in their paths, but we might get asked for one without it + return True + elif path.endswith("/") and path[:-1] in cms_paths: + # redirects have no trailing slashes in their paths, but we might get asked for one with it + return True + return False + + +def warm_page_path_cache() -> set(): + paths = _get_all_cms_paths() + logger.info(f"Warming the cache '{BEDROCK_ALL_CMS_PATHS_CACHE_KEY}' with {len(paths)} paths ") + + cache.set( + BEDROCK_ALL_CMS_PATHS_CACHE_KEY, + paths, + settings.CACHE_TIME_LONG, + ) + return paths diff --git a/bedrock/settings/base.py b/bedrock/settings/base.py index 510c1399b61..16586862e7e 100644 --- a/bedrock/settings/base.py +++ b/bedrock/settings/base.py @@ -2439,6 +2439,8 @@ def lazy_wagtail_langs(): CMS_ALLOWED_PAGE_MODELS = _allowed_page_models +CMS_DO_PAGE_PATH_PRECHECK = config("CMS_DO_PAGE_PATH_PRECHECK", default="True", parser=bool) + # Our use of django-waffle relies on the following 2 settings to be set this way so that if a switch # doesn't exist, we get `None` back from `switch_is_active`. WAFFLE_SWITCH_DEFAULT = None diff --git a/bedrock/urls.py b/bedrock/urls.py index cee73330edf..6660936daa8 100644 --- a/bedrock/urls.py +++ b/bedrock/urls.py @@ -8,13 +8,13 @@ from django.utils.module_loading import import_string import wagtaildraftsharing.urls as wagtaildraftsharing_urls -from wagtail import urls as wagtail_urls from wagtail.admin import urls as wagtailadmin_urls from wagtail.documents import urls as wagtaildocs_urls from watchman import views as watchman_views from bedrock.base import views as base_views from bedrock.base.i18n import bedrock_i18n_patterns +from bedrock.cms.bedrock_wagtail_urls import custom_wagtail_urls # The default django 404 and 500 handler doesn't run the ContextProcessors, # which breaks the base template page. So we replace them with views that do! @@ -90,7 +90,8 @@ # Note that statics are handled via Whitenoise's middleware # Wagtail is the catch-all route, and it will raise a 404 if needed. -# Note that we're also using localised URLs here +# We have customised wagtail_urls in order to decorate wagtail_serve +# to 'get ahead' of that 404 raising, to avoid hitting the database. urlpatterns += bedrock_i18n_patterns( - path("", include(wagtail_urls)), + path("", include(custom_wagtail_urls)), )