Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Clean-up logic for rebasing URLs during URL preview. (#12219)
Browse files Browse the repository at this point in the history
This is done by using `urljoin` from the standard library and by reducing the number
of places where URLs are rebased.
  • Loading branch information
clokep authored Mar 16, 2022
1 parent dda9b7f commit 4587b35
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 91 deletions.
1 change: 1 addition & 0 deletions changelog.d/12219.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Clean-up logic around rebasing URLs for URL image previews.
39 changes: 2 additions & 37 deletions synapse/rest/media/v1/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from urllib import parse as urlparse

if TYPE_CHECKING:
from lxml import etree
Expand Down Expand Up @@ -144,9 +143,7 @@ def decode_body(
return etree.fromstring(body, parser)


def parse_html_to_open_graph(
tree: "etree.Element", media_uri: str
) -> Dict[str, Optional[str]]:
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
Expand All @@ -155,7 +152,6 @@ def parse_html_to_open_graph(
Args:
tree: The parsed HTML document.
media_url: The URI used to download the body.
Returns:
The Open Graph response as a dictionary.
Expand Down Expand Up @@ -209,7 +205,7 @@ def parse_html_to_open_graph(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
)
if meta_image:
og["og:image"] = rebase_url(meta_image[0], media_uri)
og["og:image"] = meta_image[0]
else:
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
Expand Down Expand Up @@ -320,37 +316,6 @@ def _iterate_over_text(
)


def rebase_url(url: str, base: str) -> str:
    """
    Resolves a potentially relative `url` against an absolute `base` URL.

    For example:

    >>> rebase_url("subpage", "https://example.com/foo/")
    'https://example.com/foo/subpage'
    >>> rebase_url("sibling", "https://example.com/foo")
    'https://example.com/sibling'
    >>> rebase_url("/bar", "https://example.com/foo/")
    'https://example.com/bar'
    >>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
    'https://alice.com/a/'

    Args:
        url: The URL to resolve; may be relative, absolute, or a data URL.
        base: The absolute URL the relative `url` is resolved against.

    Returns:
        The resolved, absolute URL (data URLs are returned unchanged).
    """
    base_parts = urlparse.urlparse(base)
    # Convert the parsed URL to a list for (potential) modification.
    url_parts = list(urlparse.urlparse(url))
    # Add a scheme, if one does not exist.
    if not url_parts[0]:
        url_parts[0] = base_parts.scheme or "http"
    # Fix up the hostname, if this is not a data URL.
    if url_parts[0] != "data" and not url_parts[1]:
        url_parts[1] = base_parts.netloc
        # If the path does not start with a /, nest it under the base path's
        # last directory. This must only happen for URLs that inherited the
        # base's netloc above: doing it unconditionally would corrupt data
        # URLs, whose path (e.g. ",Hello") never starts with a slash.
        if not url_parts[2].startswith("/"):
            url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
    return urlparse.urlunparse(url_parts)


def summarize_paragraphs(
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
) -> Optional[str]:
Expand Down
23 changes: 12 additions & 11 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import sys
import traceback
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
from urllib import parse as urlparse
from urllib.parse import urljoin, urlparse, urlsplit
from urllib.request import urlopen

import attr
Expand All @@ -44,11 +44,7 @@
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.rest.media.v1.oembed import OEmbedProvider
from synapse.rest.media.v1.preview_html import (
decode_body,
parse_html_to_open_graph,
rebase_url,
)
from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph
from synapse.types import JsonDict, UserID
from synapse.util import json_encoder
from synapse.util.async_helpers import ObservableDeferred
Expand Down Expand Up @@ -187,7 +183,7 @@ async def _async_render_GET(self, request: SynapseRequest) -> None:
ts = self.clock.time_msec()

# XXX: we could move this into _do_preview if we wanted.
url_tuple = urlparse.urlsplit(url)
url_tuple = urlsplit(url)
for entry in self.url_preview_url_blacklist:
match = True
for attrib in entry:
Expand Down Expand Up @@ -322,7 +318,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

# Parse Open Graph information from the HTML in case the oEmbed
# response failed or is incomplete.
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
og_from_html = parse_html_to_open_graph(tree)

# Compile the Open Graph response by using the scraped
# information from the HTML and overlaying any information
Expand Down Expand Up @@ -588,12 +584,17 @@ async def _precache_image_url(
if "og:image" not in og or not og["og:image"]:
return

# The image URL from the HTML might be relative to the previewed page,
# convert it to an URL which can be requested directly.
image_url = og["og:image"]
url_parts = urlparse(image_url)
if url_parts.scheme != "data":
image_url = urljoin(media_info.uri, image_url)

# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
image_info = await self._handle_url(
rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
)
image_info = await self._handle_url(image_url, user, allow_data_urls=True)

if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
Expand Down
54 changes: 11 additions & 43 deletions tests/rest/media/v1/test_html_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
rebase_url,
summarize_paragraphs,
)

Expand Down Expand Up @@ -161,7 +160,7 @@ def test_simple(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -177,7 +176,7 @@ def test_comment(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -196,7 +195,7 @@ def test_comment2(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(
og,
Expand All @@ -218,7 +217,7 @@ def test_script(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -232,7 +231,7 @@ def test_missing_title(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

Expand All @@ -247,7 +246,7 @@ def test_h1_as_title(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

Expand All @@ -262,7 +261,7 @@ def test_missing_title_and_broken_h1(self) -> None:
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

Expand Down Expand Up @@ -290,7 +289,7 @@ def test_xml(self) -> None:
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding(self) -> None:
Expand All @@ -304,7 +303,7 @@ def test_invalid_encoding(self) -> None:
</html>
"""
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding2(self) -> None:
Expand All @@ -319,7 +318,7 @@ def test_invalid_encoding2(self) -> None:
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

def test_windows_1252(self) -> None:
Expand All @@ -333,7 +332,7 @@ def test_windows_1252(self) -> None:
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})


Expand Down Expand Up @@ -448,34 +447,3 @@ def test_unknown_invalid(self) -> None:
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])


class RebaseUrlTestCase(unittest.TestCase):
    def test_relative(self) -> None:
        """Relative URLs should be resolved based on the context of the base URL."""
        # (url, base, expected resolution)
        cases = [
            ("subpage", "https://example.com/foo/", "https://example.com/foo/subpage"),
            ("sibling", "https://example.com/foo", "https://example.com/sibling"),
            ("/bar", "https://example.com/foo/", "https://example.com/bar"),
        ]
        for url, base, expected in cases:
            self.assertEqual(rebase_url(url, base), expected)

    def test_absolute(self) -> None:
        """Absolute URLs should not be modified."""
        absolute = "https://alice.com/a/"
        self.assertEqual(rebase_url(absolute, "https://example.com/foo/"), absolute)

    def test_data(self) -> None:
        """Data URLs should not be modified."""
        data_url = "data:,Hello%2C%20World%21"
        self.assertEqual(rebase_url(data_url, "https://example.com/foo/"), data_url)
)

0 comments on commit 4587b35

Please sign in to comment.