Merge pull request #692 from willkg/501-linkify-nbsp
Fix linkify with character entities (#501)
willkg authored Jan 12, 2023
2 parents 459d370 + 515bfab commit 4971efa
Showing 2 changed files with 67 additions and 4 deletions.
65 changes: 62 additions & 3 deletions bleach/linkifier.py
@@ -147,13 +147,16 @@ def __init__(
         self.parser = html5lib_shim.BleachHTMLParser(
             tags=recognized_tags,
             strip=False,
-            consume_entities=True,
+            consume_entities=False,
             namespaceHTMLElements=False,
         )
         self.walker = html5lib_shim.getTreeWalker("etree")
         self.serializer = html5lib_shim.BleachHTMLSerializer(
             quote_attr_values="always",
             omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
             # linkify does not sanitize
             sanitize=False,
             # linkify preserves attr order
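
For context, a minimal usage sketch of what these settings buy: with consume_entities=False the tokenizer passes character entities through untouched, and with resolve_entities=False the serializer leaves them alone on output. The expected outputs below mirror the tests updated in tests/test_linkify.py later in this commit.

from bleach import linkify

# Character entities in plain text now survive linkification unchanged
print(linkify("foo &nbsp; bar"))
# foo &nbsp; bar

# Bare "&" in querystrings is still escaped on output, as before
print(linkify("http://xx.com/?a=1&b=2"))
# <a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>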
@@ -510,6 +513,62 @@ def handle_a_tag(self, token_buffer):
             yield {"type": "Characters", "data": str(new_text)}
 
         yield token_buffer[-1]
 
+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match URLs across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerFilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
     def __iter__(self):
         in_a = False
         in_skip_tag = None
@@ -564,8 +623,8 @@ def __iter__(self):

                 new_stream = self.handle_links(new_stream)
 
-                for token in new_stream:
-                    yield token
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)
 
                 # We've already yielded this token, so continue
                 continue
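To see what extract_entities does to a single Characters token, here is a standalone sketch of its splitting logic, using the same html5lib_shim helpers the method calls (next_possible_entity and match_entity) and emitting the same token-dict shapes:

from bleach import html5lib_shim

data = "foo &nbsp; bar &amp; baz"
tokens = []
for part in html5lib_shim.next_possible_entity(data):
    if not part:
        continue
    entity = html5lib_shim.match_entity(part) if part.startswith("&") else None
    if entity is None:
        tokens.append({"type": "Characters", "data": part})
        continue
    if entity == "amp":
        # special case: emit a literal "&" so URLs spanning "&amp;" still match
        tokens.append({"type": "Characters", "data": "&"})
    else:
        tokens.append({"type": "Entity", "name": entity})
    remainder = part[len(entity) + 2 :]  # skip "&" + entity + ";"
    if remainder:
        tokens.append({"type": "Characters", "data": remainder})

print(tokens)
# [{'type': 'Characters', 'data': 'foo '},
#  {'type': 'Entity', 'name': 'nbsp'},
#  {'type': 'Characters', 'data': ' bar '},
#  {'type': 'Characters', 'data': '&'},
#  {'type': 'Characters', 'data': ' baz'}]

The lone "&" Characters token for "&amp;" is deliberate: it gets merged with the surrounding text so the URL regex can match across former token boundaries, and the serializer re-escapes it on output.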
6 changes: 5 additions & 1 deletion tests/test_linkify.py
@@ -324,13 +324,17 @@ def test_link_fragment():
     )
 
 
-def test_link_entities():
+def test_link_entities_in_qs():
     assert (
         linkify("http://xx.com/?a=1&b=2")
         == '<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>'
     )
 
 
+def test_link_entities_in_characters_token():
+    assert linkify("foo &nbsp; bar") == "foo &nbsp; bar"
+
+
 def test_escaped_html():
     """If I pass in escaped HTML, it should probably come out escaped."""
     s = "&lt;em&gt;strong&lt;/em&gt;"
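One behavior implied by the &amp; special case, sketched with hedging: the querystring output is pinned down by test_link_entities_in_qs above, while the "&amp;"-as-input form follows from the comment in extract_entities rather than from a test in this diff.

from bleach import linkify

# "&amp;" is re-tokenized to a literal "&" so the URL regex can match
# across it; the serializer then re-escapes it, so the output should
# match the plain "&" case.
print(linkify("http://xx.com/?a=1&amp;b=2"))
# expected: <a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>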
