Add optional HTML cleaner to RichTextContent

Enable this by adding cleanse=True to the create_content_type invocation for your RichTextContent classes.
feincms · Aug 5, 2009 · 0ec8e83 · 0ec8e83
1 parent 58adc6f
commit 0ec8e83
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 0 deletions.
diff --git a/feincms/content/richtext/cleanse.py b/feincms/content/richtext/cleanse.py
@@ -0,0 +1,95 @@
+import lxml.html
+import lxml.html.clean
+import re
+
+
+cleanse_html_allowed = {
+    'a': ('href', 'target', 'title'),
+    'h2': (),
+    'h3': (),
+    'strong': (),
+    'em': (),
+    'p': (),
+    'ul': (),
+    'ol': (),
+    'li': (),
+    'span': (),
+    'br': (),
+    'anything': (),
+    }
+
+cleanse_html_allowed_empty_tags = ('br',)
+
+
+def cleanse_html(html):
+    """
+    Clean HTML code from ugly copy-pasted CSS and empty elements
+
+    Removes everything not explicitly allowed in `cleanse_html_allowed`
+    above.
+    """
+
+    doc = lxml.html.fromstring('<anything>%s</anything>' % html)
+    try:
+        ignore = lxml.html.tostring(doc, encoding=unicode)
+    except UnicodeDecodeError:
+        # fall back to slower BeautifulSoup if parsing failed
+        from lxml.html import soupparser
+        doc = soupparser.fromstring(u'<anything>%s</anything>' % html)
+
+    cleaner = lxml.html.clean.Cleaner(
+        allow_tags=cleanse_html_allowed.keys(),
+        remove_unknown_tags=False, # preserve surrounding 'anything' tag
+        style=False, safe_attrs_only=False, # do not strip out style
+                                            # attributes; we still need
+                                            # the style information to
+                                            # convert spans into em/strong
+                                            # tags
+        )
+    doc = cleaner.clean_html(doc)
+
+    for element in reversed(list(doc.iterdescendants())):
+        # convert span elements into em/strong if a matching style rule
+        # has been found. strong has precedence, strong & em at the same
+        # time is not supported
+        if element.tag == 'span':
+            style = element.attrib.get('style')
+            if style:
+                if 'bold' in style:
+                    element.tag = 'strong'
+                elif 'italic' in style:
+                    element.tag = 'em'
+
+            if element.tag == 'span': # still span
+                element.drop_tag() # remove tag, but preserve children and text
+                continue
+
+        # remove empty tags if they are not <br />
+        elif not element.text and element.tag not in \
+                cleanse_html_allowed_empty_tags and not \
+                len(list(element.iterdescendants())):
+            element.drop_tag()
+            continue
+
+        # remove all attributes which are not explicitly allowed
+        allowed = cleanse_html_allowed.get(element.tag, [])
+        for key in element.attrib.keys():
+            if key not in allowed:
+                del element.attrib[key]
+
+    html = lxml.html.tostring(doc, method='xml')
+
+    html = re.sub(r'</?anything>', '', html)
+
+    # remove elements containing only whitespace or linebreaks
+    whitespace_re = re.compile(r'<([a-z]+)>(<br\s*/>|&#160;|\s)*</\1>')
+    while True:
+        new = whitespace_re.sub('', html)
+        if new == html:
+            break
+        html = new
+
+    # add a space before the closing slash in empty tags
+    html = re.sub(r'<([^/>]+)/>', r'<\1 />', html)
+
+    return html
diff --git a/feincms/content/richtext/models.py b/feincms/content/richtext/models.py
@@ -3,6 +3,7 @@
 from django.utils.translation import ugettext_lazy as _
 
 from feincms import settings
+from feincms.content.richtext.cleanse import cleanse_html
 
 
 class RichTextContent(models.Model):
@@ -21,3 +22,7 @@ class Meta:
     def render(self, **kwargs):
         return mark_safe(self.text)
 
+    def save(self, *args, **kwargs):
+        if getattr(self, 'cleanse', False):
+            self.text = cleanse_html(self.text)
+        super(RichTextContent, self).save(*args, **kwargs)