Merge pull request #692 from willkg/501-linkify-nbsp
Fix linkify with character entities (#501)
willkg authored Jan 12, 2023
2 parents 459d370 + 515bfab commit 4971efa
Showing 2 changed files with 67 additions and 4 deletions.
65 changes: 62 additions & 3 deletions bleach/linkifier.py
@@ -147,13 +147,16 @@ def __init__(
         self.parser = html5lib_shim.BleachHTMLParser(
             tags=recognized_tags,
             strip=False,
-            consume_entities=True,
+            consume_entities=False,
             namespaceHTMLElements=False,
         )
         self.walker = html5lib_shim.getTreeWalker("etree")
         self.serializer = html5lib_shim.BleachHTMLSerializer(
             quote_attr_values="always",
             omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
             # linkify does not sanitize
             sanitize=False,
             # linkify preserves attr order
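
For context, a minimal usage sketch of what these settings buy: with consume_entities=False the tokenizer passes character entities through untouched, and with resolve_entities=False the serializer leaves them alone on output. The expected outputs below mirror the tests updated in tests/test_linkify.py later in this commit.

from bleach import linkify

# Character entities in plain text now survive linkification unchanged
print(linkify("foo &nbsp; bar"))
# foo &nbsp; bar

# Bare "&" in querystrings is still escaped on output, as before
print(linkify("http://xx.com/?a=1&b=2"))
# <a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>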
@@ -510,6 +513,62 @@ def handle_a_tag(self, token_buffer):
             yield {"type": "Characters", "data": str(new_text)}
 
         yield token_buffer[-1]
 
+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match URLs across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerFilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
     def __iter__(self):
         in_a = False
         in_skip_tag = None
@@ -564,8 +623,8 @@ def __iter__(self):

                 new_stream = self.handle_links(new_stream)
 
-                for token in new_stream:
-                    yield token
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)
 
                 # We've already yielded this token, so continue
                 continue
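To see what extract_entities does to a single Characters token, here is a standalone sketch of its splitting logic, using the same html5lib_shim helpers the method calls (next_possible_entity and match_entity) and emitting the same token-dict shapes:

from bleach import html5lib_shim

data = "foo &nbsp; bar &amp; baz"
tokens = []
for part in html5lib_shim.next_possible_entity(data):
    if not part:
        continue
    entity = html5lib_shim.match_entity(part) if part.startswith("&") else None
    if entity is None:
        tokens.append({"type": "Characters", "data": part})
        continue
    if entity == "amp":
        # special case: emit a literal "&" so URLs spanning "&amp;" still match
        tokens.append({"type": "Characters", "data": "&"})
    else:
        tokens.append({"type": "Entity", "name": entity})
    remainder = part[len(entity) + 2 :]  # skip "&" + entity + ";"
    if remainder:
        tokens.append({"type": "Characters", "data": remainder})

print(tokens)
# [{'type': 'Characters', 'data': 'foo '},
#  {'type': 'Entity', 'name': 'nbsp'},
#  {'type': 'Characters', 'data': ' bar '},
#  {'type': 'Characters', 'data': '&'},
#  {'type': 'Characters', 'data': ' baz'}]

The lone "&" Characters token for "&amp;" is deliberate: it gets merged with the surrounding text so the URL regex can match across former token boundaries, and the serializer re-escapes it on output.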
6 changes: 5 additions & 1 deletion tests/test_linkify.py
@@ -324,13 +324,17 @@ def test_link_fragment():
     )
 
 
-def test_link_entities():
+def test_link_entities_in_qs():
     assert (
         linkify("http://xx.com/?a=1&b=2")
         == '<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>'
     )
 
 
+def test_link_entities_in_characters_token():
+    assert linkify("foo &nbsp; bar") == "foo &nbsp; bar"
+
+
 def test_escaped_html():
     """If I pass in escaped HTML, it should probably come out escaped."""
     s = "&lt;em&gt;strong&lt;/em&gt;"
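One behavior implied by the &amp; special case, sketched with hedging: the querystring output is pinned down by test_link_entities_in_qs above, while the "&amp;"-as-input form follows from the comment in extract_entities rather than from a test in this diff.

from bleach import linkify

# "&amp;" is re-tokenized to a literal "&" so the URL regex can match
# across it; the serializer then re-escapes it, so the output should
# match the plain "&" case.
print(linkify("http://xx.com/?a=1&amp;b=2"))
# expected: <a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">http://xx.com/?a=1&amp;b=2</a>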
