Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Autodiscover oEmbed endpoint from returned HTML (#10822)
Browse files Browse the repository at this point in the history
Searches the returned HTML for an oEmbed endpoint using the
autodiscovery mechanism (`<link rel=...>`) and, if one is found,
requests that endpoint to generate the preview.
  • Loading branch information
clokep committed Oct 8, 2021
1 parent 593eeac commit 1b11284
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 55 deletions.
1 change: 1 addition & 0 deletions changelog.d/10822.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Support autodiscovery of oEmbed previews.
26 changes: 26 additions & 0 deletions synapse/rest/media/v1/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,32 @@ def get_oembed_url(self, url: str) -> Optional[str]:
# No match.
return None

def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
    """
    Look through a parsed HTML document for an advertised oEmbed endpoint.

    Args:
        tree: The parsed HTML body.

    Returns:
        The href of the first matching link element, or None if no
        matching link element carries an href.
    """
    # Queries are tried in order: the standard "alternate" rel first, then
    # "alternative", which some providers (e.g. Flickr) use instead.
    link_queries = (
        "//link[@rel='alternate'][@type='application/json+oembed']",
        "//link[@rel='alternative'][@type='application/json+oembed']",
    )

    # The generator is lazy, so the second query only runs when the first
    # one produced no link element with an href.
    discovered = (
        link.attrib["href"]
        for query in link_queries
        for link in tree.xpath(query)
        if "href" in link.attrib
    )
    return next(discovered, None)

def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Parse the oEmbed response into an Open Graph response.
Expand Down
112 changes: 74 additions & 38 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import shutil
import sys
import traceback
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
from urllib import parse as urlparse

import attr
Expand Down Expand Up @@ -296,22 +296,32 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
body = file.read()

encoding = get_html_media_encoding(body, media_info.media_type)
og = decode_and_calc_og(body, media_info.uri, encoding)

await self._precache_image_url(user, media_info, og)

elif oembed_url and _is_json(media_info.media_type):
# Handle an oEmbed response.
with open(media_info.filename, "rb") as file:
body = file.read()

oembed_response = self._oembed.parse_oembed_response(url, body)
og = oembed_response.open_graph_result

# Use the cache age from the oEmbed result, instead of the HTTP response.
if oembed_response.cache_age is not None:
expiration_ms = oembed_response.cache_age
tree = decode_body(body, encoding)
if tree is not None:
# Check if this HTML document points to oEmbed information and
# defer to that.
oembed_url = self._oembed.autodiscover_from_html(tree)
og = {}
if oembed_url:
oembed_info = await self._download_url(oembed_url, user)
og, expiration_ms = await self._handle_oembed_response(
url, oembed_info, expiration_ms
)

# If there was no oEmbed URL (or oEmbed parsing failed), attempt
# to generate the Open Graph information from the HTML.
if not oembed_url or not og:
og = _calc_og(tree, media_info.uri)

await self._precache_image_url(user, media_info, og)
else:
og = {}

elif oembed_url:
# Handle the oEmbed information.
og, expiration_ms = await self._handle_oembed_response(
url, media_info, expiration_ms
)
await self._precache_image_url(user, media_info, og)

else:
Expand Down Expand Up @@ -479,6 +489,39 @@ async def _precache_image_url(
else:
del og["og:image"]

async def _handle_oembed_response(
    self, url: str, media_info: MediaInfo, expiration_ms: int
) -> Tuple[JsonDict, int]:
    """
    Turn a downloaded oEmbed document into Open Graph data.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        media_info: The media being previewed.
        expiration_ms: The length of time, in milliseconds, the media is
            valid for.

    Returns:
        A tuple of:
            The Open Graph dictionary produced from the oEmbed info; an
            empty dictionary when the download is not JSON.

            The length of time, in milliseconds, the media is valid for.
            This is replaced by the oEmbed cache age only when the Open
            Graph result is non-empty and a cache age was supplied.
    """
    if _is_json(media_info.media_type):
        with open(media_info.filename, "rb") as oembed_file:
            raw_oembed = oembed_file.read()

        parsed = self._oembed.parse_oembed_response(url, raw_oembed)
        og_result = parsed.open_graph_result

        # Prefer the cache age from the oEmbed result, if one was given.
        if og_result and parsed.cache_age is not None:
            expiration_ms = parsed.cache_age

        return og_result, expiration_ms

    # Anything other than JSON cannot be oEmbed data; there is nothing to do.
    return {}, expiration_ms

def _start_expire_url_cache_data(self) -> Deferred:
return run_as_background_process(
"expire_url_cache_data", self._expire_url_cache_data
Expand Down Expand Up @@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
return "utf-8"


def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> JsonDict:
def decode_body(
body: bytes, request_encoding: Optional[str] = None
) -> Optional["etree.Element"]:
"""
Calculate metadata for an HTML document.
This uses lxml to parse the HTML document into the OG response. If errors
occur during processing of the document, an empty response is returned.
This uses lxml to parse the HTML document.
Args:
body: The HTML document, as bytes.
media_url: The URI used to download the body.
request_encoding: The character encoding of the body, as a string.
Returns:
The OG response as a dictionary.
The parsed HTML body, or None if an error occurred during processing.
"""
# If there's no body, nothing useful is going to be found.
if not body:
return {}
return None

from lxml import etree

Expand All @@ -662,25 +701,22 @@ def decode_and_calc_og(
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return {}

def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)

# The data was successfully parsed, but no tree was found.
if tree is None:
return {}
return None

return _calc_og(tree, media_uri)
def _attempt_decode_body(
body_attempt: Union[bytes, str]
) -> Optional["etree.Element"]:
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body_attempt, parser)

# Attempt to parse the body. If this fails, log and return no metadata.
try:
return _attempt_calc_og(body)
return _attempt_decode_body(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
return _attempt_calc_og(body.decode("utf-8", "ignore"))
return _attempt_decode_body(body.decode("utf-8", "ignore"))


def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
Expand Down
100 changes: 99 additions & 1 deletion tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,9 +725,107 @@ def test_oembed_format(self):
},
)

def test_oembed_autodiscovery(self):
    """
    An oEmbed endpoint advertised in an HTML response is discovered and used.

    The flow being exercised:

    1. Request a preview of a URL which is not known to the oEmbed code.
    2. It returns HTML including a link to an oEmbed preview.
    3. The oEmbed preview is requested and returns a URL for an image.
    4. The image is requested for thumbnailing.
    """
    # This is a little cheesy in that we use the www subdomain (which isn't in
    # the list of oEmbed patterns) to get a "raw" HTML response.
    for hostname in ("www.twitter.com", "publish.twitter.com", "cdn.twitter.com"):
        self.lookups[hostname] = [(IPv4Address, "10.1.2.3")]

    def respond(connection_index, content_type_header, payload):
        # Attach a fake server to the outbound connection at the given index,
        # deliver one complete HTTP response, and pump the reactor. The
        # server side is returned so the caller can inspect what was sent.
        client = self.reactor.tcpClients[connection_index][2].buildProtocol(None)
        server = AccumulatingProtocol()
        server.makeConnection(FakeTransport(client, self.reactor))
        client.makeConnection(FakeTransport(server, self.reactor))
        client.dataReceived(
            (b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + content_type_header)
            % (len(payload),)
            + payload
        )
        self.pump()
        return server

    # The HTML page, which advertises the oEmbed endpoint.
    html_page = (
        b"\n"
        b'<link rel="alternate" type="application/json+oembed"\n'
        b'href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"\n'
        b'title="matrixdotorg" />\n'
    )

    # The oEmbed response, which points at the image.
    oembed_payload = json.dumps(
        {
            "version": "1.0",
            "type": "photo",
            "url": "http://cdn.twitter.com/matrixdotorg",
        }
    ).encode("utf-8")

    channel = self.make_request(
        "GET",
        "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
        shorthand=False,
        await_result=False,
    )
    self.pump()

    # First request: the page itself.
    respond(0, b'Content-Type: text/html; charset="utf8"\r\n\r\n', html_page)

    # Ensure a second request is made to the oEmbed URL.
    oembed_server = respond(
        1, b'Content-Type: application/json; charset="utf8"\r\n\r\n', oembed_payload
    )
    # Ensure the URL is what was requested.
    self.assertIn(b"/oembed?", oembed_server.data)

    # Ensure a third request is made to the photo URL.
    image_server = respond(2, b"Content-Type: image/png\r\n\r\n", SMALL_PNG)
    # Ensure the URL is what was requested.
    self.assertIn(b"/matrixdotorg", image_server.data)

    self.assertEqual(channel.code, 200)
    preview = channel.json_body
    self.assertEqual(
        preview["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
    )
    self.assertTrue(preview["og:image"].startswith("mxc://"))
    self.assertEqual(preview["og:image:height"], 1)
    self.assertEqual(preview["og:image:width"], 1)
    self.assertEqual(preview["og:image:type"], "image/png")

def _download_image(self):
"""Downloads an image into the URL cache.
Returns:
A (host, media_id) tuple representing the MXC URI of the image.
"""
Expand Down
Loading

0 comments on commit 1b11284

Please sign in to comment.