From 349aa5f4127fcc79695225868b5c7265eb74d4c4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 8 Sep 2021 15:15:05 -0400 Subject: [PATCH 1/6] Provide a better title for previews. --- synapse/rest/media/v1/oembed.py | 12 +++++++++++- tests/rest/media/v1/test_url_preview.py | 7 ++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 8b74e72655bc..8090f648b9a5 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -122,7 +122,17 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: cache_age = int(cache_age) # The results. - open_graph_response = {"og:title": oembed.get("title")} + open_graph_response = {} + + # Use either title or author's name as the title. + title = oembed.get("title") or oembed.get("author_name") + if title: + open_graph_response["og:title"] = title + + # Use the provider name and as the site. + provider_name = oembed.get("provider_name") + if provider_name: + open_graph_response["og:site_name"] = provider_name # If a thumbnail exists, use it. Note that dimensions will be calculated later. if "thumbnail_url" in oembed: diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 9d1389958406..31a87c25fe1f 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -620,7 +620,6 @@ def test_oembed_photo(self): self.assertIn(b"/matrixdotorg", server.data) self.assertEqual(channel.code, 200) - self.assertIsNone(channel.json_body["og:title"]) self.assertTrue(channel.json_body["og:image"].startswith("mxc://")) self.assertEqual(channel.json_body["og:image:height"], 1) self.assertEqual(channel.json_body["og:image:width"], 1) @@ -633,6 +632,8 @@ def test_oembed_rich(self): result = { "version": "1.0", "type": "rich", + # Note that this provides the author, not the title. + "author_name": "Alice", "html": "
Content Preview
", } end_content = json.dumps(result).encode("utf-8") @@ -662,7 +663,7 @@ def test_oembed_rich(self): self.assertEqual(channel.code, 200) self.assertEqual( channel.json_body, - {"og:title": None, "og:description": "Content Preview"}, + {"og:title": "Alice", "og:description": "Content Preview"}, ) def test_oembed_format(self): @@ -707,5 +708,5 @@ def test_oembed_format(self): self.assertEqual(channel.code, 200) self.assertEqual( channel.json_body, - {"og:title": None, "og:description": "Content Preview"}, + {"og:description": "Content Preview"}, ) From 41db6a765f18715d39a3372910b983f4088e86b9 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 8 Sep 2021 15:19:39 -0400 Subject: [PATCH 2/6] Attempt to parse photo/video URLs. --- synapse/rest/media/v1/oembed.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 8090f648b9a5..63fe2734949b 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import urllib.parse -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, List, Optional import attr @@ -22,6 +22,8 @@ from synapse.util import json_decoder if TYPE_CHECKING: + from lxml import etree + from synapse.server import HomeServer logger = logging.getLogger(__name__) @@ -147,6 +149,15 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: # If this is a photo, use the full image, not the thumbnail. open_graph_response["og:image"] = oembed["url"] + elif oembed_type == "video": + open_graph_response["og:type"] = "video.other" + calc_description_and_urls(open_graph_response, oembed["html"]) + open_graph_response["og:video:width"] = oembed["width"] + open_graph_response["og:video:height"] = oembed["height"] + + elif oembed_type == "link": + open_graph_response["og:type"] = "website" + else: raise RuntimeError(f"Unknown oEmbed type: {oembed_type}") @@ -159,6 +170,14 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: return OEmbedResult(open_graph_response, cache_age) +def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: + results = [] + for tag in tree.xpath("//*/" + tag_name): + if "src" in tag.attrib: + results.append(tag.attrib["src"]) + return results + + def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: """ Calculate description for an HTML document. @@ -189,6 +208,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> if tree is None: return + # Attempt to find interesting URLs (images, videos, embeds). + if "og:image" not in open_graph_response: + image_urls = _fetch_urls(tree, "img") + if image_urls: + open_graph_response["og:image"] = image_urls[0] + + video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") + if video_urls: + open_graph_response["og:video"] = video_urls[0] + from synapse.rest.media.v1.preview_url_resource import _calc_description description = _calc_description(tree) From 0265a7a6e32a396e79a8ddfeb2d1128b3767604e Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 8 Sep 2021 15:22:16 -0400 Subject: [PATCH 3/6] Include the original URL. --- synapse/rest/media/v1/oembed.py | 4 +++- tests/rest/media/v1/test_url_preview.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 63fe2734949b..408c3e6b8328 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -124,7 +124,9 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: cache_age = int(cache_age) # The results. - open_graph_response = {} + open_graph_response = { + "og:url": url, + } # Use either title or author's name as the title. title = oembed.get("title") or oembed.get("author_name") diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 31a87c25fe1f..1140452e43aa 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -620,6 +620,7 @@ def test_oembed_photo(self): self.assertIn(b"/matrixdotorg", server.data) self.assertEqual(channel.code, 200) + self.assertIn("og:url", channel.json_body) self.assertTrue(channel.json_body["og:image"].startswith("mxc://")) self.assertEqual(channel.json_body["og:image:height"], 1) self.assertEqual(channel.json_body["og:image:width"], 1) @@ -661,9 +662,11 @@ def test_oembed_rich(self): self.pump() self.assertEqual(channel.code, 200) + # The JSON body should have a URL, but we don't really care what it is. + body = channel.json_body + body.pop("og:url") self.assertEqual( - channel.json_body, - {"og:title": "Alice", "og:description": "Content Preview"}, + body, {"og:title": "Alice", "og:description": "Content Preview"} ) def test_oembed_format(self): @@ -706,7 +709,7 @@ def test_oembed_format(self): self.assertIn(b"format=json", server.data) self.assertEqual(channel.code, 200) - self.assertEqual( - channel.json_body, - {"og:description": "Content Preview"}, - ) + # The JSON body should have a URL, but we don't really care what it is. + body = channel.json_body + body.pop("og:url") + self.assertEqual(body, {"og:description": "Content Preview"}) From a6cf90509902a8364033d40a27c00a49ca6f7645 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 14 Sep 2021 12:35:54 -0400 Subject: [PATCH 4/6] Newsfragment --- changelog.d/10819.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/10819.feature diff --git a/changelog.d/10819.feature b/changelog.d/10819.feature new file mode 100644 index 000000000000..4fa95a6cc968 --- /dev/null +++ b/changelog.d/10819.feature @@ -0,0 +1 @@ +Improve oEmbed previews by processing the author name, photo, and video information. From a3bb63c4779f90fbf8ea8b09a63fc91a465accdd Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 21 Sep 2021 12:33:17 -0400 Subject: [PATCH 5/6] The cache age should be in milliseconds. --- synapse/rest/media/v1/oembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 408c3e6b8328..e04671fb95cf 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -33,7 +33,7 @@ class OEmbedResult: # The Open Graph result (converted from the oEmbed result). open_graph_result: JsonDict - # Number of seconds to cache the content, according to the oEmbed response. + # Number of milliseconds to cache the content, according to the oEmbed response. # # This will be None if no cache-age is provided in the oEmbed response (or # if the oEmbed response cannot be turned into an Open Graph response). @@ -121,7 +121,7 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: # Ensure the cache age is None or an int. cache_age = oembed.get("cache_age") if cache_age: - cache_age = int(cache_age) + cache_age = int(cache_age) * 1000 # The results. open_graph_response = { From 7dcf07f62861088119a07b2570128079079bfa89 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Sep 2021 09:14:42 -0400 Subject: [PATCH 6/6] Pass the proper URL to the oEmbed parser. --- synapse/rest/media/v1/preview_url_resource.py | 2 +- tests/rest/media/v1/test_url_preview.py | 30 ++++++++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 0a0b476d2b26..9ffa983fbb53 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -305,7 +305,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes: with open(media_info.filename, "rb") as file: body = file.read() - oembed_response = self._oembed.parse_oembed_response(media_info.uri, body) + oembed_response = self._oembed.parse_oembed_response(url, body) og = oembed_response.open_graph_result # Use the cache age from the oEmbed result, instead of the HTTP response. diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 1140452e43aa..d83dfacfedc0 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -620,11 +620,12 @@ def test_oembed_photo(self): self.assertIn(b"/matrixdotorg", server.data) self.assertEqual(channel.code, 200) - self.assertIn("og:url", channel.json_body) - self.assertTrue(channel.json_body["og:image"].startswith("mxc://")) - self.assertEqual(channel.json_body["og:image:height"], 1) - self.assertEqual(channel.json_body["og:image:width"], 1) - self.assertEqual(channel.json_body["og:image:type"], "image/png") + body = channel.json_body + self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345") + self.assertTrue(body["og:image"].startswith("mxc://")) + self.assertEqual(body["og:image:height"], 1) + self.assertEqual(body["og:image:width"], 1) + self.assertEqual(body["og:image:type"], "image/png") def test_oembed_rich(self): """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" @@ -662,11 +663,14 @@ def test_oembed_rich(self): self.pump() self.assertEqual(channel.code, 200) - # The JSON body should have a URL, but we don't really care what it is. body = channel.json_body - body.pop("og:url") self.assertEqual( - body, {"og:title": "Alice", "og:description": "Content Preview"} + body, + { + "og:url": "http://twitter.com/matrixdotorg/status/12345", + "og:title": "Alice", + "og:description": "Content Preview", + }, ) def test_oembed_format(self): @@ -709,7 +713,11 @@ def test_oembed_format(self): self.assertIn(b"format=json", server.data) self.assertEqual(channel.code, 200) - # The JSON body should have a URL, but we don't really care what it is. body = channel.json_body - body.pop("og:url") - self.assertEqual(body, {"og:description": "Content Preview"}) + self.assertEqual( + body, + { + "og:url": "http://www.hulu.com/watch/12345", + "og:description": "Content Preview", + }, + )