Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Improve oEmbed previews #10819

Merged
merged 6 commits into from
Sep 22, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/10819.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve oEmbed previews by processing the author name, photo, and video information.
49 changes: 45 additions & 4 deletions synapse/rest/media/v1/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.
import logging
import urllib.parse
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, List, Optional

import attr

Expand All @@ -22,6 +22,8 @@
from synapse.util import json_decoder

if TYPE_CHECKING:
from lxml import etree

from synapse.server import HomeServer

logger = logging.getLogger(__name__)
Expand All @@ -31,7 +33,7 @@
class OEmbedResult:
# The Open Graph result (converted from the oEmbed result).
open_graph_result: JsonDict
# Number of seconds to cache the content, according to the oEmbed response.
# Number of milliseconds to cache the content, according to the oEmbed response.
#
# This will be None if no cache-age is provided in the oEmbed response (or
# if the oEmbed response cannot be turned into an Open Graph response).
Expand Down Expand Up @@ -119,10 +121,22 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
# Ensure the cache age is None or an int.
cache_age = oembed.get("cache_age")
if cache_age:
cache_age = int(cache_age)
cache_age = int(cache_age) * 1000

# The results.
open_graph_response = {"og:title": oembed.get("title")}
open_graph_response = {
"og:url": url,
}
squahtx marked this conversation as resolved.
Show resolved Hide resolved

# Use either title or author's name as the title.
title = oembed.get("title") or oembed.get("author_name")
if title:
open_graph_response["og:title"] = title

# Use the provider name as the site name.
provider_name = oembed.get("provider_name")
if provider_name:
open_graph_response["og:site_name"] = provider_name

# If a thumbnail exists, use it. Note that dimensions will be calculated later.
if "thumbnail_url" in oembed:
Expand All @@ -137,6 +151,15 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
# If this is a photo, use the full image, not the thumbnail.
open_graph_response["og:image"] = oembed["url"]

elif oembed_type == "video":
open_graph_response["og:type"] = "video.other"
calc_description_and_urls(open_graph_response, oembed["html"])
open_graph_response["og:video:width"] = oembed["width"]
open_graph_response["og:video:height"] = oembed["height"]

elif oembed_type == "link":
open_graph_response["og:type"] = "website"

else:
raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")

Expand All @@ -149,6 +172,14 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, cache_age)


def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
results = []
for tag in tree.xpath("//*/" + tag_name):
if "src" in tag.attrib:
results.append(tag.attrib["src"])
return results


def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
"""
Calculate description for an HTML document.
Expand Down Expand Up @@ -179,6 +210,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
if tree is None:
return

# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
image_urls = _fetch_urls(tree, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]

video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]

from synapse.rest.media.v1.preview_url_resource import _calc_description

description = _calc_description(tree)
Expand Down
18 changes: 11 additions & 7 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ def test_oembed_photo(self):
self.assertIn(b"/matrixdotorg", server.data)

self.assertEqual(channel.code, 200)
self.assertIsNone(channel.json_body["og:title"])
self.assertIn("og:url", channel.json_body)
self.assertTrue(channel.json_body["og:image"].startswith("mxc://"))
self.assertEqual(channel.json_body["og:image:height"], 1)
self.assertEqual(channel.json_body["og:image:width"], 1)
Expand All @@ -633,6 +633,8 @@ def test_oembed_rich(self):
result = {
"version": "1.0",
"type": "rich",
# Note that this provides the author, not the title.
"author_name": "Alice",
"html": "<div>Content Preview</div>",
}
end_content = json.dumps(result).encode("utf-8")
Expand Down Expand Up @@ -660,9 +662,11 @@ def test_oembed_rich(self):

self.pump()
self.assertEqual(channel.code, 200)
# The JSON body should have a URL, but we don't really care what it is.
body = channel.json_body
body.pop("og:url")
self.assertEqual(
channel.json_body,
{"og:title": None, "og:description": "Content Preview"},
body, {"og:title": "Alice", "og:description": "Content Preview"}
)

def test_oembed_format(self):
Expand Down Expand Up @@ -705,7 +709,7 @@ def test_oembed_format(self):
self.assertIn(b"format=json", server.data)

self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body,
{"og:title": None, "og:description": "Content Preview"},
)
# The JSON body should have a URL, but we don't really care what it is.
body = channel.json_body
body.pop("og:url")
self.assertEqual(body, {"og:description": "Content Preview"})