From d5c91cfbd96dd5ca83c7c9e1d6ad502ef95e107f Mon Sep 17 00:00:00 2001
From: Manuel Reinhardt
Date: Mon, 5 Aug 2024 13:47:56 +0200
Subject: [PATCH] feat: Shortcut in safe_html

Check for signs of html or script, skip further processing if none are
found. Saves processing time for lxml parsing and manipulation.
---
 Products/PortalTransforms/transforms/safe_html.py | 9 +++++++++
 news/66.feature                                   | 1 +
 2 files changed, 10 insertions(+)
 create mode 100644 news/66.feature

diff --git a/Products/PortalTransforms/transforms/safe_html.py b/Products/PortalTransforms/transforms/safe_html.py
index 787e7e1..fc25179 100644
--- a/Products/PortalTransforms/transforms/safe_html.py
+++ b/Products/PortalTransforms/transforms/safe_html.py
@@ -4,6 +4,7 @@
 from lxml_html_clean import Cleaner
 from plone.base.interfaces import IFilterSchema
 from plone.base.utils import safe_bytes
+from plone.base.utils import safe_text
 from plone.registry.interfaces import IRegistry
 from Products.PortalTransforms.interfaces import ITransform
 from Products.PortalTransforms.libtransforms.utils import bodyfinder
@@ -183,6 +184,14 @@ def cleaner_options(self):
         return options
 
     def scrub_html(self, orig):
+        orig_text = safe_text(orig)
+        # short cut if no html or script is detected
+        if not orig or not (
+            hasScript(orig_text)
+            or "<" in orig_text
+            or any((entity in orig_text for entity in html5entities.values()))
+        ):
+            return orig_text
         # append html tag to create a dummy parent for the tree
         html_parser = html.HTMLParser(encoding="utf-8")
         orig = safe_bytes(orig)
diff --git a/news/66.feature b/news/66.feature
new file mode 100644
index 0000000..c07a4bb
--- /dev/null
+++ b/news/66.feature
@@ -0,0 +1 @@
+Shortcut in safe_html: Check for signs of html or script, skip further processing if none are found.