diff --git a/html_sanitizer/sanitizer.py b/html_sanitizer/sanitizer.py index 81a1d16..a8436f0 100644 --- a/html_sanitizer/sanitizer.py +++ b/html_sanitizer/sanitizer.py @@ -138,6 +138,14 @@ def __init__(self, settings=None): set(self.attributes.keys()) - self.tags, )) + @staticmethod + def is_mergeable(e1, e2): + """ + Decide if the adjacent elements of the same type e1 and e2 can be + merged. This can be overridden to honour distinct classes etc. + """ + return True + def sanitize(self, html): """ Clean HTML code from ugly copy-pasted CSS and empty elements @@ -232,7 +240,8 @@ def sanitize(self, html): # tag type nx = element.getnext() if (whitespace_re.match(element.tail or '') and - nx is not None and nx.tag == element.tag): + nx is not None and nx.tag == element.tag and + self.is_mergeable(element, nx)): # Yes, we should. Tail is empty, that is, no text between # tags of a mergeable type. if nx.text: diff --git a/html_sanitizer/tests.py b/html_sanitizer/tests.py index c409906..ab27c42 100644 --- a/html_sanitizer/tests.py +++ b/html_sanitizer/tests.py @@ -255,6 +255,9 @@ def test_14_classes(self): ), ( '

Test span

', '

Test span

', + ), ( + '

Test spanspan

', + '

Test span span

', ), ( '

Test

', '

Test

', @@ -263,7 +266,30 @@ def test_14_classes(self): '

Test

', )], sanitizer=sanitizer) - def test_15_emoji(self): + def test_15_classes(self): + """Class attributes may disable merging""" + sanitizer = Sanitizer({ + 'tags': {'h1', 'h2', 'p', 'a', 'span'}, + 'attributes': { + 'a': ('href', 'name', 'target', 'title', 'id'), + 'h1': ('class',), + 'p': ('class',), + 'span': ('class',), + }, + 'empty': set(), + 'separate': {'a', 'p'}, + 'is_mergeable': lambda e1, e2: e1.get('class') == e2.get('class'), + }) + + self.run_tests([( + '

Test spanspan

', + '

Test spanspan

', + ), ( + '

Test spanspan

', + '

Test span span

', + )], sanitizer=sanitizer) + + def test_16_emoji(self): self.run_tests([( '

😂

', '

😂

',