Autodiscover oEmbed endpoint from returned HTML (#10822)

Searches the returned HTML for an oEmbed endpoint using the autodiscovery mechanism (`<link rel=...>`) and, if one is found, requests that endpoint to generate the preview.
matrix-org · Oct 8, 2021 · 1b11284 · 1b11284
1 parent 593eeac
commit 1b11284
Show file tree

Hide file tree

Showing 5 changed files with 224 additions and 55 deletions.
diff --git a/changelog.d/10822.feature b/changelog.d/10822.feature
@@ -0,0 +1 @@
+Support autodiscovery of oEmbed previews.
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
@@ -96,6 +96,32 @@ def get_oembed_url(self, url: str) -> Optional[str]:
  # No match.
  return None
 
+ def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+ """
+ Search an HTML document for oEmbed autodiscovery information.
+
+ Args:
+ tree: The parsed HTML body.
+
+ Returns:
+ The URL to use for oEmbed information, or None if no URL was found.
+ """
+ # Search for link elements with the proper rel and type attributes.
+ for tag in tree.xpath(
+ "//link[@rel='alternate'][@type='application/json+oembed']"
+ ):
+ if "href" in tag.attrib:
+ return tag.attrib["href"]
+
+ # Some providers (e.g. Flickr) use alternative instead of alternate.
+ for tag in tree.xpath(
+ "//link[@rel='alternative'][@type='application/json+oembed']"
+ ):
+ if "href" in tag.attrib:
+ return tag.attrib["href"]
+
+ return None
+
  def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
  """
  Parse the oEmbed response into an Open Graph response.

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
@@ -22,7 +22,7 @@
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
 from urllib import parse as urlparse
 
 import attr
@@ -296,22 +296,32 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
  body = file.read()
 
  encoding = get_html_media_encoding(body, media_info.media_type)
- og = decode_and_calc_og(body, media_info.uri, encoding)
-
- await self._precache_image_url(user, media_info, og)
-
- elif oembed_url and _is_json(media_info.media_type):
- # Handle an oEmbed response.
- with open(media_info.filename, "rb") as file:
- body = file.read()
-
- oembed_response = self._oembed.parse_oembed_response(url, body)
- og = oembed_response.open_graph_result
-
- # Use the cache age from the oEmbed result, instead of the HTTP response.
- if oembed_response.cache_age is not None:
- expiration_ms = oembed_response.cache_age
+ tree = decode_body(body, encoding)
+ if tree is not None:
+ # Check if this HTML document points to oEmbed information and
+ # defer to that.
+ oembed_url = self._oembed.autodiscover_from_html(tree)
+ og = {}
+ if oembed_url:
+ oembed_info = await self._download_url(oembed_url, user)
+ og, expiration_ms = await self._handle_oembed_response(
+ url, oembed_info, expiration_ms
+ )
+
+ # If there was no oEmbed URL (or oEmbed parsing failed), attempt
+ # to generate the Open Graph information from the HTML.
+ if not oembed_url or not og:
+ og = _calc_og(tree, media_info.uri)
+
+ await self._precache_image_url(user, media_info, og)
+ else:
+ og = {}
 
+ elif oembed_url:
+ # Handle the oEmbed information.
+ og, expiration_ms = await self._handle_oembed_response(
+ url, media_info, expiration_ms
+ )
  await self._precache_image_url(user, media_info, og)
 
  else:
@@ -479,6 +489,39 @@ async def _precache_image_url(
  else:
  del og["og:image"]
 
+ async def _handle_oembed_response(
+ self, url: str, media_info: MediaInfo, expiration_ms: int
+ ) -> Tuple[JsonDict, int]:
+ """
+ Parse the downloaded oEmbed info.
+
+ Args:
+ url: The URL which is being previewed (not the one which was
+ requested).
+ media_info: The media being previewed.
+ expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+ Returns:
+ A tuple of:
+ The Open Graph dictionary, if the oEmbed info can be parsed.
+ The (possibly updated) length of time, in milliseconds, the media is valid for.
+ """
+ # If JSON was not returned, there's nothing to do.
+ if not _is_json(media_info.media_type):
+ return {}, expiration_ms
+
+ with open(media_info.filename, "rb") as file:
+ body = file.read()
+
+ oembed_response = self._oembed.parse_oembed_response(url, body)
+ open_graph_result = oembed_response.open_graph_result
+
+ # Use the cache age from the oEmbed result, if one was given.
+ if open_graph_result and oembed_response.cache_age is not None:
+ expiration_ms = oembed_response.cache_age
+
+ return open_graph_result, expiration_ms
+
  def _start_expire_url_cache_data(self) -> Deferred:
  return run_as_background_process(
  "expire_url_cache_data", self._expire_url_cache_data
@@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
  return "utf-8"
 
 
-def decode_and_calc_og(
- body: bytes, media_uri: str, request_encoding: Optional[str] = None
-) -> JsonDict:
+def decode_body(
+ body: bytes, request_encoding: Optional[str] = None
+) -> Optional["etree.Element"]:
  """
- Calculate metadata for an HTML document.
-
- This uses lxml to parse the HTML document into the OG response. If errors
- occur during processing of the document, an empty response is returned.
+ This uses lxml to parse the HTML document.
 
  Args:
  body: The HTML document, as bytes.
- media_url: The URI used to download the body.
  request_encoding: The character encoding of the body, as a string.
 
  Returns:
- The OG response as a dictionary.
+ The parsed HTML body, or None if an error occurred during processing.
  """
  # If there's no body, nothing useful is going to be found.
  if not body:
- return {}
+ return None
 
  from lxml import etree
 
@@ -662,25 +701,22 @@ def decode_and_calc_og(
  parser = etree.HTMLParser(recover=True, encoding="utf-8")
  except Exception as e:
  logger.warning("Unable to create HTML parser: %s" % (e,))
- return {}
-
- def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
- # Attempt to parse the body. If this fails, log and return no metadata.
- tree = etree.fromstring(body_attempt, parser)
-
- # The data was successfully parsed, but no tree was found.
- if tree is None:
- return {}
+ return None
 
- return _calc_og(tree, media_uri)
+ def _attempt_decode_body(
+ body_attempt: Union[bytes, str]
+ ) -> Optional["etree.Element"]:
+ # Attempt to parse the body. Returns None if the body was successfully
+ # parsed, but no tree was found.
+ return etree.fromstring(body_attempt, parser)
 
  # Attempt to parse the body. If this fails, log and return no metadata.
  try:
- return _attempt_calc_og(body)
+ return _attempt_decode_body(body)
  except UnicodeDecodeError:
  # blindly try decoding the body as utf-8, which seems to fix
  # the charset mismatches on https://google.com
- return _attempt_calc_og(body.decode("utf-8", "ignore"))
+ return _attempt_decode_body(body.decode("utf-8", "ignore"))
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:

diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
@@ -725,9 +725,107 @@ def test_oembed_format(self):
  },
  )
 
+ def test_oembed_autodiscovery(self):
+ """
+ Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
+ 1. Request a preview of a URL which is not known to the oEmbed code.
+ 2. It returns HTML including a link to an oEmbed preview.
+ 3. The oEmbed preview is requested and returns a URL for an image.
+ 4. The image is requested for thumbnailing.
+ """
+ # This is a little cheesy in that we use the www subdomain (which isn't in the
+ # list of oEmbed patterns) to get a "raw" HTML response.
+ self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+ self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+ self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+ result = b"""
+ <link rel="alternate" type="application/json+oembed"
+ href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+ title="matrixdotorg" />
+ """
+
+ channel = self.make_request(
+ "GET",
+ "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+ shorthand=False,
+ await_result=False,
+ )
+ self.pump()
+
+ client = self.reactor.tcpClients[0][2].buildProtocol(None)
+ server = AccumulatingProtocol()
+ server.makeConnection(FakeTransport(client, self.reactor))
+ client.makeConnection(FakeTransport(server, self.reactor))
+ client.dataReceived(
+ (
+ b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+ b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+ )
+ % (len(result),)
+ + result
+ )
+
+ self.pump()
+
+ # The oEmbed response.
+ result2 = {
+ "version": "1.0",
+ "type": "photo",
+ "url": "http://cdn.twitter.com/matrixdotorg",
+ }
+ oembed_content = json.dumps(result2).encode("utf-8")
+
+ # Ensure a second request is made to the oEmbed URL.
+ client = self.reactor.tcpClients[1][2].buildProtocol(None)
+ server = AccumulatingProtocol()
+ server.makeConnection(FakeTransport(client, self.reactor))
+ client.makeConnection(FakeTransport(server, self.reactor))
+ client.dataReceived(
+ (
+ b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+ b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+ )
+ % (len(oembed_content),)
+ + oembed_content
+ )
+
+ self.pump()
+
+ # Ensure the URL is what was requested.
+ self.assertIn(b"/oembed?", server.data)
+
+ # Ensure a third request is made to the photo URL.
+ client = self.reactor.tcpClients[2][2].buildProtocol(None)
+ server = AccumulatingProtocol()
+ server.makeConnection(FakeTransport(client, self.reactor))
+ client.makeConnection(FakeTransport(server, self.reactor))
+ client.dataReceived(
+ (
+ b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+ b"Content-Type: image/png\r\n\r\n"
+ )
+ % (len(SMALL_PNG),)
+ + SMALL_PNG
+ )
+
+ self.pump()
+
+ # Ensure the URL is what was requested.
+ self.assertIn(b"/matrixdotorg", server.data)
+
+ self.assertEqual(channel.code, 200)
+ body = channel.json_body
+ self.assertEqual(
+ body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
+ )
+ self.assertTrue(body["og:image"].startswith("mxc://"))
+ self.assertEqual(body["og:image:height"], 1)
+ self.assertEqual(body["og:image:width"], 1)
+ self.assertEqual(body["og:image:type"], "image/png")
+
  def _download_image(self):
  """Downloads an image into the URL cache.
-
  Returns:
  A (host, media_id) tuple representing the MXC URI of the image.
  """