From bfcb3960f48b8275ddea3fd2a842fd78825a3112 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 27 Aug 2021 12:04:03 -0400 Subject: [PATCH] Pull oEmbed from a providers.json file and allow overriding the configuration. --- changelog.d/10714.feature | 1 + docs/sample_config.yaml | 19 ++ synapse/config/homeserver.py | 2 + synapse/config/oembed.py | 178 +++++++++++++++ synapse/res/providers.json | 17 ++ synapse/rest/media/v1/oembed.py | 42 +--- synapse/rest/media/v1/preview_url_resource.py | 2 +- tests/rest/media/v1/test_url_preview.py | 212 +++++++++--------- 8 files changed, 330 insertions(+), 143 deletions(-) create mode 100644 changelog.d/10714.feature create mode 100644 synapse/config/oembed.py create mode 100644 synapse/res/providers.json diff --git a/changelog.d/10714.feature b/changelog.d/10714.feature new file mode 100644 index 000000000000..bcb5a199f629 --- /dev/null +++ b/changelog.d/10714.feature @@ -0,0 +1 @@ +Allow configuration of the oEmbed URLs used for URL previews. diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 935841dbfa37..62a31efb9129 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1075,6 +1075,25 @@ url_preview_accept_language: # - en +oembed: + # By default, the providers.json from https://oembed.com/ is included + # with Synapse. + # + # Uncomment the following to disable using these default oEmbed URLs. + # Defaults to 'false'. + # + #disable_default_providers: true + + # Additional files with oEmbed configuration (each should be in the + # form of providers.json). + # + # By default, this list is empty (so only the default providers.json + # is used). + # + #additional_providers: + # - oembed/my_providers.json + + ## Captcha ## # See docs/CAPTCHA_SETUP.md for full details of configuring this. 
diff --git a/synapse/config/homeserver.py b/synapse/config/homeserver.py index 1f42a51857c6..442f1b9ac071 100644 --- a/synapse/config/homeserver.py +++ b/synapse/config/homeserver.py @@ -30,6 +30,7 @@ from .logger import LoggingConfig from .metrics import MetricsConfig from .modules import ModulesConfig +from .oembed import OembedConfig from .oidc import OIDCConfig from .password_auth_providers import PasswordAuthProviderConfig from .push import PushConfig @@ -65,6 +66,7 @@ class HomeServerConfig(RootConfig): LoggingConfig, RatelimitConfig, ContentRepositoryConfig, + OembedConfig, CaptchaConfig, VoipConfig, RegistrationConfig, diff --git a/synapse/config/oembed.py b/synapse/config/oembed.py new file mode 100644 index 000000000000..6d5110195695 --- /dev/null +++ b/synapse/config/oembed.py @@ -0,0 +1,178 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import re +from typing import Any, Dict, Iterable, List, Pattern +from urllib import parse as urlparse + +import attr +import pkg_resources + +from synapse.types import JsonDict + +from ._base import Config, ConfigError +from ._util import validate_config + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class OEmbedEndpointConfig: + # The API endpoint to fetch. + api_endpoint: str + # The patterns to match. 
+ url_patterns: List[Pattern] + + +class OembedConfig(Config): + """oEmbed Configuration""" + + section = "oembed" + + def read_config(self, config, **kwargs): + oembed_config: Dict[str, Any] = config.get("oembed") or {} + + # A list of patterns which will be used. + self.oembed_patterns: List[OEmbedEndpointConfig] = list( + self._parse_and_validate_providers(oembed_config) + ) + + def _parse_and_validate_providers( + self, oembed_config: dict + ) -> Iterable[OEmbedEndpointConfig]: + """Extract and parse the oEmbed providers from the given JSON file. + + Returns a generator which yields the OEmbedEndpointConfig objects + """ + # Whether to use the packaged providers.json file. + if not oembed_config.get("disable_default_providers") or False: + providers = json.load( + pkg_resources.resource_stream("synapse", "res/providers.json") + ) + yield from self._parse_and_validate_provider( + providers, config_path=("oembed",) + ) + + # The JSON files which include additional provider information. + for i, file in enumerate(oembed_config.get("additional_providers") or []): + # TODO Error checking. + with open(file) as f: + providers = json.load(f) + + yield from self._parse_and_validate_provider( + providers, + config_path=( + "oembed", + "additional_providers", + f"", + ), + ) + + def _parse_and_validate_provider( + self, providers: List[JsonDict], config_path: Iterable[str] + ) -> Iterable[OEmbedEndpointConfig]: + # Ensure it is the proper form. + validate_config( + _OEMBED_PROVIDER_SCHEMA, + providers, + config_path=config_path, + ) + + # Parse it and yield each result. + for provider in providers: + # Each provider might have multiple API endpoints, each which + # might have multiple patterns to match. 
+ for endpoint in provider["endpoints"]: + api_endpoint = endpoint["url"] + patterns = [ + self._glob_to_pattern(glob, config_path) + for glob in endpoint["schemes"] + ] + yield OEmbedEndpointConfig(api_endpoint, patterns) + + def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern: + """ + Convert the glob into a sane regular expression to match against. The + rules followed will be slightly different for the domain portion vs. + the rest. + + 1. The scheme must be one of HTTP / HTTPS (and have no globs). + 2. The domain can have globs, but we limit it to characters that can + reasonably be a domain part. + TODO: This does not attempt to handle Unicode domain names. + TODO: The domain should not allow wildcard TLDs. + 3. Other parts allow a glob to be any one, or more, characters. + """ + results = urlparse.urlparse(glob) + + # Ensure the scheme does not have wildcards (and is a sane scheme). + if results.scheme not in {"http", "https"}: + raise ConfigError(f"Insecure oEmbed scheme: {results.scheme}", config_path) + + pattern = urlparse.urlunparse( + [ + results.scheme, + re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), + ] + + [re.escape(part).replace("\\*", ".+") for part in results[2:]] + ) + return re.compile(pattern) + + def generate_config_section(self, **kwargs): + return """\ + oembed: + # By default, the providers.json from https://oembed.com/ is included + # with Synapse. + # + # Uncomment the following to disable using these default oEmbed URLs. + # Defaults to 'false'. + # + #disable_default_providers: true + + # Additional files with oEmbed configuration (each should be in the + # form of providers.json). + # + # By default, this list is empty (so only the default providers.json + # is used). 
+ # + #additional_providers: + # - oembed/my_providers.json + """ + + +_OEMBED_PROVIDER_SCHEMA = { + "type": "array", + "items": { + "type": "object", + "properties": { + "provider_name": {"type": "string"}, + "provider_url": {"type": "string"}, + "endpoints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "schemes": { + "type": "array", + "items": {"type": "string"}, + }, + "url": {"type": "string"}, + "formats": {"type": "array", "items": {"type": "string"}}, + "discovery": {"type": "boolean"}, + }, + "required": ["schemes", "url"], + }, + }, + }, + "required": ["provider_name", "provider_url", "endpoints"], + }, +} diff --git a/synapse/res/providers.json b/synapse/res/providers.json new file mode 100644 index 000000000000..f1838f955901 --- /dev/null +++ b/synapse/res/providers.json @@ -0,0 +1,17 @@ +[ + { + "provider_name": "Twitter", + "provider_url": "http://www.twitter.com/", + "endpoints": [ + { + "schemes": [ + "https://twitter.com/*/status/*", + "https://*.twitter.com/*/status/*", + "https://twitter.com/*/moments/*", + "https://*.twitter.com/*/moments/*" + ], + "url": "https://publish.twitter.com/oembed" + } + ] + } +] \ No newline at end of file diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 2937107d749b..a2472d030f24 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import re -from typing import Optional -from urllib import parse as urlparse +from typing import TYPE_CHECKING, Optional import attr from synapse.http.client import SimpleHttpClient +if TYPE_CHECKING: + from synapse.server import HomeServer + logger = logging.getLogger(__name__) @@ -38,33 +39,6 @@ "http://*.twitter.com/*/moments/*", ], } -# Convert the globs to regular expressions. 
-_oembed_patterns = {} -for endpoint, globs in _oembed_globs.items(): - for glob in globs: - # Convert the glob into a sane regular expression to match against. The - # rules followed will be slightly different for the domain portion vs. - # the rest. - # - # 1. The scheme must be one of HTTP / HTTPS (and have no globs). - # 2. The domain can have globs, but we limit it to characters that can - # reasonably be a domain part. - # TODO: This does not attempt to handle Unicode domain names. - # 3. Other parts allow a glob to be any one, or more, characters. - results = urlparse.urlparse(glob) - - # Ensure the scheme does not have wildcards (and is a sane scheme). - if results.scheme not in {"http", "https"}: - raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,)) - - pattern = urlparse.urlunparse( - [ - results.scheme, - re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), - ] - + [re.escape(part).replace("\\*", ".+") for part in results[2:]] - ) - _oembed_patterns[re.compile(pattern)] = endpoint @attr.s(slots=True) @@ -82,7 +56,11 @@ class OEmbedError(Exception): class OEmbedProvider: - def __init__(self, client: SimpleHttpClient): + def __init__(self, hs: "HomeServer", client: SimpleHttpClient): + self._oembed_patterns = {} + for oembed_endpoint in hs.config.oembed.oembed_patterns: + for pattern in oembed_endpoint.url_patterns: + self._oembed_patterns[pattern] = oembed_endpoint.api_endpoint self._client = client def get_oembed_url(self, url: str) -> Optional[str]: @@ -95,7 +73,7 @@ def get_oembed_url(self, url: str) -> Optional[str]: Returns: A URL to use instead or None if the original URL should be used. 
""" - for url_pattern, endpoint in _oembed_patterns.items(): + for url_pattern, endpoint in self._oembed_patterns.items(): if url_pattern.fullmatch(url): return endpoint diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 96669d92621d..317d333b1238 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -99,7 +99,7 @@ def __init__( self.primary_base_path = media_repo.primary_base_path self.media_storage = media_storage - self._oembed = OEmbedProvider(self.client) + self._oembed = OEmbedProvider(hs, self.client) # We run the background jobs if we're the instance specified (or no # instance is specified, where we assume there is only one instance diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index d56598621717..7fa902722770 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -14,13 +14,14 @@ import json import os import re -from unittest.mock import patch from twisted.internet._resolver import HostResolution from twisted.internet.address import IPv4Address, IPv6Address from twisted.internet.error import DNSLookupError from twisted.test.proto_helpers import AccumulatingProtocol +from synapse.config.oembed import OEmbedEndpointConfig + from tests import unittest from tests.server import FakeTransport @@ -81,6 +82,19 @@ def make_homeserver(self, reactor, clock): hs = self.setup_test_homeserver(config=config) + # After the hs is created, modify the parsed oEmbed config (to avoid + # messing with files). + # + # Note that HTTP URLs are used to avoid having to deal with TLS in tests. 
+ hs.config.oembed.oembed_patterns = [ + OEmbedEndpointConfig( + api_endpoint="http://publish.twitter.com/oembed", + url_patterns=[ + re.compile(r"http://twitter\.com/.+/status/.+"), + ], + ) + ] + return hs def prepare(self, reactor, clock, hs): @@ -544,123 +558,101 @@ def test_accept_language_config_option(self): def test_oembed_photo(self): """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL.""" - # Route the HTTP version to an HTTP endpoint so that the tests work. - with patch.dict( - "synapse.rest.media.v1.oembed._oembed_patterns", - { - re.compile( - r"http://twitter\.com/.+/status/.+" - ): "http://publish.twitter.com/oembed", - }, - clear=True, - ): - - self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] - self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] - - result = { - "version": "1.0", - "type": "photo", - "url": "http://cdn.twitter.com/matrixdotorg", - } - oembed_content = json.dumps(result).encode("utf-8") - - end_content = ( - b"" - b"Some Title" - b'' - b"" - ) + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] - channel = self.make_request( - "GET", - "preview_url?url=http://twitter.com/matrixdotorg/status/12345", - shorthand=False, - await_result=False, - ) - self.pump() - - client = self.reactor.tcpClients[0][2].buildProtocol(None) - server = AccumulatingProtocol() - server.makeConnection(FakeTransport(client, self.reactor)) - client.makeConnection(FakeTransport(server, self.reactor)) - client.dataReceived( - ( - b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" - b'Content-Type: application/json; charset="utf8"\r\n\r\n' - ) - % (len(oembed_content),) - + oembed_content - ) + result = { + "version": "1.0", + "type": "photo", + "url": "http://cdn.twitter.com/matrixdotorg", + } + oembed_content = json.dumps(result).encode("utf-8") - self.pump() - - client = 
self.reactor.tcpClients[1][2].buildProtocol(None) - server = AccumulatingProtocol() - server.makeConnection(FakeTransport(client, self.reactor)) - client.makeConnection(FakeTransport(server, self.reactor)) - client.dataReceived( - ( - b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" - b'Content-Type: text/html; charset="utf8"\r\n\r\n' - ) - % (len(end_content),) - + end_content + end_content = ( + b"" + b"Some Title" + b'' + b"" + ) + + channel = self.make_request( + "GET", + "preview_url?url=http://twitter.com/matrixdotorg/status/12345", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' ) + % (len(oembed_content),) + + oembed_content + ) - self.pump() + self.pump() - self.assertEqual(channel.code, 200) - self.assertEqual( - channel.json_body, {"og:title": "Some Title", "og:description": "hi"} + client = self.reactor.tcpClients[1][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="utf8"\r\n\r\n' ) + % (len(end_content),) + + end_content + ) + + self.pump() + + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, {"og:title": "Some Title", "og:description": "hi"} + ) def test_oembed_rich(self): """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" - # Route the HTTP version to an HTTP endpoint so that the tests work. 
- with patch.dict( - "synapse.rest.media.v1.oembed._oembed_patterns", - { - re.compile( - r"http://twitter\.com/.+/status/.+" - ): "http://publish.twitter.com/oembed", - }, - clear=True, - ): - - self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] - - result = { - "version": "1.0", - "type": "rich", - "html": "
Content Preview
", - } - end_content = json.dumps(result).encode("utf-8") - - channel = self.make_request( - "GET", - "preview_url?url=http://twitter.com/matrixdotorg/status/12345", - shorthand=False, - await_result=False, - ) - self.pump() - - client = self.reactor.tcpClients[0][2].buildProtocol(None) - server = AccumulatingProtocol() - server.makeConnection(FakeTransport(client, self.reactor)) - client.makeConnection(FakeTransport(server, self.reactor)) - client.dataReceived( - ( - b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" - b'Content-Type: application/json; charset="utf8"\r\n\r\n' - ) - % (len(end_content),) - + end_content - ) + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + + result = { + "version": "1.0", + "type": "rich", + "html": "
Content Preview
", + } + end_content = json.dumps(result).encode("utf-8") + + channel = self.make_request( + "GET", + "preview_url?url=http://twitter.com/matrixdotorg/status/12345", + shorthand=False, + await_result=False, + ) + self.pump() - self.pump() - self.assertEqual(channel.code, 200) - self.assertEqual( - channel.json_body, - {"og:title": None, "og:description": "Content Preview"}, + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' ) + % (len(end_content),) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, + {"og:title": None, "og:description": "Content Preview"}, + )