From d86daaeee7a453e3a4005a79d344c4a1f6d05901 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 9 May 2023 12:34:17 -0400
Subject: [PATCH 1/8] Separate URL checker out to a separate method.

---
 synapse/media/url_previewer.py    |  89 +++++++++++++---------
 tests/media/test_url_previewer.py | 122 ++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+), 36 deletions(-)
 create mode 100644 tests/media/test_url_previewer.py

diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
index c8a4a809f129..c5938bde1da6 100644
--- a/synapse/media/url_previewer.py
+++ b/synapse/media/url_previewer.py
@@ -203,42 +203,10 @@ def __init__(
             )
 
     async def preview(self, url: str, user: UserID, ts: int) -> bytes:
-        # XXX: we could move this into _do_preview if we wanted.
-        url_tuple = urlsplit(url)
-        for entry in self.url_preview_url_blacklist:
-            match = True
-            for attrib in entry:
-                pattern = entry[attrib]
-                value = getattr(url_tuple, attrib)
-                logger.debug(
-                    "Matching attrib '%s' with value '%s' against pattern '%s'",
-                    attrib,
-                    value,
-                    pattern,
-                )
-
-                if value is None:
-                    match = False
-                    continue
-
-                # Some attributes might not be parsed as strings by urlsplit (such as the
-                # port, which is parsed as an int). Because we use match functions that
-                # expect strings, we want to make sure that's what we give them.
-                value_str = str(value)
-
-                if pattern.startswith("^"):
-                    if not re.match(pattern, value_str):
-                        match = False
-                        continue
-                else:
-                    if not fnmatch.fnmatch(value_str, pattern):
-                        match = False
-                        continue
-            if match:
-                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
-                raise SynapseError(
-                    403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
-                )
+        if self._is_url_blocked(url):
+            raise SynapseError(
+                403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
+            )
 
         # the in-memory cache:
         # * ensures that only one request is active at a time
@@ -411,6 +379,55 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
 
         return jsonog.encode("utf8")
 
+    def _is_url_blocked(self, url: str) -> bool:
+        """
+        Check whether the URL is allowed to be previewed (according to the homeserver
+        configuration).
+
+        Args:
+            url: The requested URL.
+
+        Return:
+            True if the URL is blocked, False if it is allowed.
+        """
+        # XXX: we could move this into _do_preview if we wanted.
+        url_tuple = urlsplit(url)
+        for entry in self.url_preview_url_blacklist:
+            match = True
+            for attrib in entry:
+                pattern = entry[attrib]
+                value = getattr(url_tuple, attrib)
+                logger.debug(
+                    "Matching attrib '%s' with value '%s' against pattern '%s'",
+                    attrib,
+                    value,
+                    pattern,
+                )
+
+                if value is None:
+                    match = False
+                    continue
+
+                # Some attributes might not be parsed as strings by urlsplit (such as the
+                # port, which is parsed as an int). Because we use match functions that
+                # expect strings, we want to make sure that's what we give them.
+                value_str = str(value)
+
+                if pattern.startswith("^"):
+                    if not re.match(pattern, value_str):
+                        match = False
+                        continue
+                else:
+                    if not fnmatch.fnmatch(value_str, pattern):
+                        match = False
+                        continue
+
+            if match:
+                logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
+                return match
+
+        return False
+
     async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
         """
         Fetches a remote URL and parses the headers.
diff --git a/tests/media/test_url_previewer.py b/tests/media/test_url_previewer.py
new file mode 100644
index 000000000000..e2d67bf7174a
--- /dev/null
+++ b/tests/media/test_url_previewer.py
@@ -0,0 +1,122 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+from twisted.test.proto_helpers import MemoryReactor
+
+from synapse.server import HomeServer
+from synapse.util import Clock
+
+from tests import unittest
+from tests.unittest import override_config
+
+try:
+    import lxml
+except ImportError:
+    lxml = None
+
+
+class URLPreviewTests(unittest.HomeserverTestCase):
+    if not lxml:
+        skip = "url preview feature requires lxml"
+
+    hijack_auth = True
+    user_id = "@test:user"
+    end_content = (
+        b"<html><head>"
+        b'<meta property="og:title" content="~matrix~" />'
+        b'<meta property="og:description" content="hi" />'
+        b"</head></html>"
+    )
+
+    def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
+        config = self.default_config()
+        config["url_preview_enabled"] = True
+        config["max_spider_size"] = 9999999
+        config["url_preview_ip_range_blacklist"] = (
+            "192.168.1.1",
+            "1.0.0.0/8",
+            "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
+            "2001:800::/21",
+        )
+
+        self.storage_path = self.mktemp()
+        self.media_store_path = self.mktemp()
+        os.mkdir(self.storage_path)
+        os.mkdir(self.media_store_path)
+        config["media_store_path"] = self.media_store_path
+
+        provider_config = {
+            "module": "synapse.media.storage_provider.FileStorageProviderBackend",
+            "store_local": True,
+            "store_synchronous": False,
+            "store_remote": True,
+            "config": {"directory": self.storage_path},
+        }
+
+        config["media_storage_providers"] = [provider_config]
+
+        return self.setup_test_homeserver(config=config)
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        media_repo_resource = hs.get_media_repository_resource()
+        preview_url = media_repo_resource.children[b"preview_url"]
+        self.url_previewer = preview_url._url_previewer
+
+    def test_all_urls_allowed(self) -> None:
+        self.assertFalse(self.url_previewer._is_url_blocked("http://matrix.org"))
+        self.assertFalse(self.url_previewer._is_url_blocked("https://matrix.org"))
+        self.assertFalse(self.url_previewer._is_url_blocked("http://localhost:8000"))
+        self.assertFalse(
+            self.url_previewer._is_url_blocked("http://user:pass@matrix.org")
+        )
+
+    @override_config(
+        {
+            "url_preview_url_blacklist": [
+                {"username": "user"},
+                {"scheme": "http", "netloc": "matrix.org"},
+            ]
+        }
+    )
+    def test_blocked_url(self) -> None:
+        # Blocked via scheme and URL.
+        self.assertTrue(self.url_previewer._is_url_blocked("http://matrix.org"))
+        # Not blocked because all components must match.
+        self.assertFalse(self.url_previewer._is_url_blocked("https://matrix.org"))
+
+        # Blocked due to the user.
+        self.assertTrue(
+            self.url_previewer._is_url_blocked("http://user:pass@example.com")
+        )
+        self.assertTrue(self.url_previewer._is_url_blocked("http://user@example.com"))
+
+    @override_config({"url_preview_url_blacklist": [{"netloc": "*.example.com"}]})
+    def test_glob_blocked_url(self) -> None:
+        # All subdomains are blocked.
+        self.assertTrue(self.url_previewer._is_url_blocked("http://foo.example.com"))
+        self.assertTrue(self.url_previewer._is_url_blocked("http://.example.com"))
+
+        # The TLD is not blocked.
+        self.assertFalse(self.url_previewer._is_url_blocked("https://example.com"))
+
+    @override_config({"url_preview_url_blacklist": [{"netloc": "^.+\\.example\\.com"}]})
+    def test_regex_blocked_urL(self) -> None:
+        # All subdomains are blocked.
+        self.assertTrue(self.url_previewer._is_url_blocked("http://foo.example.com"))
+        # Requires a non-empty subdomain.
+        self.assertFalse(self.url_previewer._is_url_blocked("http://.example.com"))
+
+        # The TLD is not blocked.
+        self.assertFalse(self.url_previewer._is_url_blocked("https://example.com"))

From 03ac76acd466f1cd5c2fb9d5cb524a37c169da3a Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Mon, 15 May 2023 15:03:02 -0400
Subject: [PATCH 2/8] Check more URLs against the deny list.

---
 synapse/media/url_previewer.py       |  16 +-
 tests/rest/media/test_url_preview.py | 221 ++++++++++++++++++++++++++-
 2 files changed, 228 insertions(+), 9 deletions(-)

diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
index c5938bde1da6..966961cd94c2 100644
--- a/synapse/media/url_previewer.py
+++ b/synapse/media/url_previewer.py
@@ -209,7 +209,7 @@ async def preview(self, url: str, user: UserID, ts: int) -> bytes:
             )
 
         # the in-memory cache:
-        # * ensures that only one request is active at a time
+        # * ensures that only one request to a URL is active at a time
         # * takes load off the DB for the thundering herds
         # * also caches any failures (unlike the DB) so we don't keep
         #    requesting the same endpoint
@@ -251,10 +251,10 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
                 og = og.encode("utf8")
             return og
 
-        # If this URL can be accessed via oEmbed, use that instead.
+        # If this URL can be accessed via an allowed oEmbed, use that instead.
         url_to_download = url
         oembed_url = self._oembed.get_oembed_url(url)
-        if oembed_url:
+        if oembed_url and not self._is_url_blocked(oembed_url):
             url_to_download = oembed_url
 
         media_info = await self._handle_url(url_to_download, user)
@@ -297,7 +297,8 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
                 # defer to that.
                 oembed_url = self._oembed.autodiscover_from_html(tree)
                 og_from_oembed: JsonDict = {}
-                if oembed_url:
+                # Only download to the oEmbed URL if it is allowed.
+                if oembed_url and not self._is_url_blocked(oembed_url):
                     try:
                         oembed_info = await self._handle_url(
                             oembed_url, user, allow_data_urls=True
@@ -390,7 +391,6 @@ def _is_url_blocked(self, url: str) -> bool:
         Return:
             True if the URL is blocked, False if it is allowed.
         """
-        # XXX: we could move this into _do_preview if we wanted.
         url_tuple = urlsplit(url)
         for entry in self.url_preview_url_blacklist:
             match = True
@@ -422,10 +422,12 @@ def _is_url_blocked(self, url: str) -> bool:
                         match = False
                         continue
 
+            # All fields matched, return true (the URL is blocked).
             if match:
                 logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
                 return match
 
+        # No matches were found, the URL is allowed.
         return False
 
     async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
@@ -640,6 +642,10 @@ async def _precache_image_url(
         if not image_url:
             return
 
+        # Don't attempt to download the URL if it is blocked.
+        if self._is_url_blocked(image_url):
+            return
+
         # The image URL from the HTML might be relative to the previewed page,
         # convert it to an URL which can be requested directly.
         url_parts = urlparse(image_url)
diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py
index e44beae8c13f..a21369ce238e 100644
--- a/tests/rest/media/test_url_preview.py
+++ b/tests/rest/media/test_url_preview.py
@@ -653,6 +653,57 @@ def test_accept_language_config_option(self) -> None:
             server.data,
         )
 
+    def test_image(self) -> None:
+        """An image should be precached if mentioned in the HTML."""
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.matrix.org"] = [(IPv4Address, "10.1.2.4")]
+
+        result = (
+            b"""<html><body><img src="http://cdn.matrix.org/foo.png"></body></html>"""
+        )
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        # Respond with the HTML.
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+        self.pump()
+
+        # Respond with the photo.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+        self.pump()
+
+        # The image should be in the result.
+        self.assertEqual(channel.code, 200)
+        self._assert_small_png(channel.json_body)
+
     def test_nonexistent_image(self) -> None:
         """If the preview image doesn't exist, ensure some data is returned."""
         self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
@@ -683,9 +734,53 @@ def test_nonexistent_image(self) -> None:
         )
 
         self.pump()
+
+        # There should not be a second connection.
+        self.assertEqual(len(self.reactor.tcpClients), 1)
+
+        # The image should not be in the result.
         self.assertEqual(channel.code, 200)
+        self.assertNotIn("og:image", channel.json_body)
+
+    @unittest.override_config(
+        {"url_preview_url_blacklist": [{"netloc": "cdn.matrix.org"}]}
+    )
+    def test_image_blocked(self) -> None:
+        """If the preview image doesn't exist, ensure some data is returned."""
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.matrix.org"] = [(IPv4Address, "10.1.2.4")]
+
+        result = (
+            b"""<html><body><img src="http://cdn.matrix.org/foo.jpg"></body></html>"""
+        )
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://matrix.org",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+        self.pump()
+
+        # There should not be a second connection.
+        self.assertEqual(len(self.reactor.tcpClients), 1)
 
         # The image should not be in the result.
+        self.assertEqual(channel.code, 200)
         self.assertNotIn("og:image", channel.json_body)
 
     def test_oembed_failure(self) -> None:
@@ -880,6 +975,11 @@ def test_oembed_rich(self) -> None:
         )
 
         self.pump()
+
+        # Double check that the proper host is being connected to. (Note that
+        # twitter.com can't be resolved so this is already implicitly checked.)
+        self.assertIn(b"\r\nHost: publish.twitter.com\r\n", server.data)
+
         self.assertEqual(channel.code, 200)
         body = channel.json_body
         self.assertEqual(
@@ -940,6 +1040,49 @@ def test_oembed_format(self) -> None:
             },
         )
 
+    @unittest.override_config(
+        {"url_preview_url_blacklist": [{"netloc": "publish.twitter.com"}]}
+    )
+    def test_oembed_blocked(self) -> None:
+        """The original URL, not the oEmbed URL, should be used if the oEmbed URL is blocked."""
+        self.lookups["twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        content = b"<html><body>Test</body></html>"
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(content),)
+            + content
+        )
+        self.pump()
+
+        # Double check that the proper host is being connected to. (Note that
+        # publish.twitter.com can't be resolved so this is already implicitly checked.)
+        self.assertIn(b"\r\nHost: twitter.com\r\n", server.data)
+
+        # There should not be a second request being made.
+        self.assertEqual(len(self.reactor.tcpClients), 1)
+
+        # It should be succssful, but there should be no image in the body.
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(body["og:description"], "Test")
+
     def test_oembed_autodiscovery(self) -> None:
         """
         Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
@@ -980,7 +1123,6 @@ def test_oembed_autodiscovery(self) -> None:
             % (len(result),)
             + result
         )
-
         self.pump()
 
         # The oEmbed response.
@@ -1004,7 +1146,6 @@ def test_oembed_autodiscovery(self) -> None:
             % (len(oembed_content),)
             + oembed_content
         )
-
         self.pump()
 
         # Ensure the URL is what was requested.
@@ -1023,7 +1164,6 @@ def test_oembed_autodiscovery(self) -> None:
             % (len(SMALL_PNG),)
             + SMALL_PNG
         )
-
         self.pump()
 
         # Ensure the URL is what was requested.
@@ -1036,6 +1176,59 @@ def test_oembed_autodiscovery(self) -> None:
         )
         self._assert_small_png(body)
 
+    @unittest.override_config(
+        {"url_preview_url_blacklist": [{"netloc": "publish.twitter.com"}]}
+    )
+    def test_oembed_autodiscovery_blocked(self) -> None:
+        """
+        If the discovered oEmbed URL is blocked, it should be discarded.
+        """
+        # This is a little cheesy in that we use the www subdomain (which isn't the
+        # list of oEmbed patterns) to get "raw" HTML response.
+        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.4")]
+
+        result = b"""
+        <title>Test</title>
+        <link rel="alternate" type="application/json+oembed"
+            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+
+        # Ensure there's no additional connections.
+        self.assertEqual(len(self.reactor.tcpClients), 1)
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"\r\nHost: www.twitter.com\r\n", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(body["og:title"], "Test")
+        self.assertNotIn("og:image", body)
+
     def _download_image(self) -> Tuple[str, str]:
         """Downloads an image into the URL cache.
         Returns:
@@ -1192,7 +1385,7 @@ def test_cache_expiry(self) -> None:
             )
 
     @unittest.override_config({"url_preview_url_blacklist": [{"port": "*"}]})
-    def test_blacklist_port(self) -> None:
+    def test_blocked_port(self) -> None:
         """Tests that blacklisting URLs with a port makes previewing such URLs
         fail with a 403 error and doesn't impact other previews.
         """
@@ -1230,3 +1423,23 @@ def test_blacklist_port(self) -> None:
 
         self.pump()
         self.assertEqual(channel.code, 200)
+
+    @unittest.override_config(
+        {"url_preview_url_blacklist": [{"netloc": "example.com"}]}
+    )
+    def test_blocked_url(self) -> None:
+        """Tests that blacklisting URLs with a host makes previewing such URLs
+        fail with a 403 error.
+        """
+        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
+
+        bad_url = quote("http://example.com/foo")
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=" + bad_url,
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+        self.assertEqual(channel.code, 403, channel.result)

From 18e2b3bd9a84a0258535a222347fe60d073d1b76 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Mon, 15 May 2023 15:06:54 -0400
Subject: [PATCH 3/8] Add comments / minor refactoring.

---
 synapse/media/url_previewer.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
index 966961cd94c2..d3dc5c437d29 100644
--- a/synapse/media/url_previewer.py
+++ b/synapse/media/url_previewer.py
@@ -394,8 +394,9 @@ def _is_url_blocked(self, url: str) -> bool:
         url_tuple = urlsplit(url)
         for entry in self.url_preview_url_blacklist:
             match = True
-            for attrib in entry:
-                pattern = entry[attrib]
+            # Iterate over each entry. If *all* attributes of that entry match
+            # the current URL, then reject it.
+            for attrib, pattern in entry.items():
                 value = getattr(url_tuple, attrib)
                 logger.debug(
                     "Matching attrib '%s' with value '%s' against pattern '%s'",
@@ -406,21 +407,23 @@ def _is_url_blocked(self, url: str) -> bool:
 
                 if value is None:
                     match = False
-                    continue
+                    break
 
                 # Some attributes might not be parsed as strings by urlsplit (such as the
                 # port, which is parsed as an int). Because we use match functions that
                 # expect strings, we want to make sure that's what we give them.
                 value_str = str(value)
 
+                # Check the value against the pattern as either a regular expression or
+                # a glob. If it doesn't match, the entry doesn't match.
                 if pattern.startswith("^"):
                     if not re.match(pattern, value_str):
                         match = False
-                        continue
+                        break
                 else:
                     if not fnmatch.fnmatch(value_str, pattern):
                         match = False
-                        continue
+                        break
 
             # All fields matched, return true (the URL is blocked).
             if match:

From 66c3a531f1ca1295bbef6ebeb7e51505653a1728 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Mon, 15 May 2023 16:00:27 -0400
Subject: [PATCH 4/8] Documentation updates.

---
 synapse/media/url_previewer.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
index d3dc5c437d29..023cc527cc8b 100644
--- a/synapse/media/url_previewer.py
+++ b/synapse/media/url_previewer.py
@@ -113,7 +113,7 @@ class UrlPreviewer:
        1. Checks URL and timestamp against the database cache and returns the result if it
           has not expired and was successful (a 2xx return code).
        2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
-          does, update the URL to download.
+          does and the new URL is not blocked, update the URL to download.
        3. Downloads the URL and stores it into a file via the media storage provider
           and saves the local media metadata.
        4. If the media is an image:
@@ -127,14 +127,14 @@ class UrlPreviewer:
                 and saves the local media metadata.
              2. Convert the oEmbed response to an Open Graph response.
              3. Override any Open Graph data from the HTML with data from oEmbed.
-          4. If an image exists in the Open Graph response:
+          4. If an image URL exists in the Open Graph response:
              1. Downloads the URL and stores it into a file via the media storage
                 provider and saves the local media metadata.
              2. Generates thumbnails.
              3. Updates the Open Graph response based on image properties.
-       6. If the media is JSON and an oEmbed URL was found:
+       6. If an oEmbed URL was found and the media is JSON:
           1. Convert the oEmbed response to an Open Graph response.
-          2. If a thumbnail or image is in the oEmbed response:
+          2. If an image URL is in the oEmbed response:
              1. Downloads the URL and stores it into a file via the media storage
                 provider and saves the local media metadata.
              2. Generates thumbnails.
@@ -144,7 +144,8 @@ class UrlPreviewer:
 
     If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
     image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole
-    does not fail. As much information as possible is returned.
+    does not fail. If any of them are blocked, then those additional requests
+    are skipped. As much information as possible is returned.
 
     The in-memory cache expires after 1 hour.
 
@@ -212,7 +213,10 @@ async def preview(self, url: str, user: UserID, ts: int) -> bytes:
         # * ensures that only one request to a URL is active at a time
         # * takes load off the DB for the thundering herds
         # * also caches any failures (unlike the DB) so we don't keep
-        #    requesting the same endpoint
+        #   requesting the same endpoint
+        #
+        # Note that autodiscovered oEmbed URLs and pre-caching of images
+        # are not captured in the in-memory cache.
 
         observable = self._cache.get(url)
 

From bf5dfd8e140e03471e9e382b0fdb3bbbf64afcdf Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Mon, 15 May 2023 16:01:58 -0400
Subject: [PATCH 5/8] Newsfragment

---
 changelog.d/15601.bugfix | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/15601.bugfix

diff --git a/changelog.d/15601.bugfix b/changelog.d/15601.bugfix
new file mode 100644
index 000000000000..426db6cea3f4
--- /dev/null
+++ b/changelog.d/15601.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where the `url_preview_url_blacklist` configuration setting was not applied to oEmbed or image URLs found while previewing a URL.

From f09f3cc2ddd0ed5f7178665e04ba3dd52e0e63cf Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 16 May 2023 07:17:49 -0400
Subject: [PATCH 6/8] Run the URL check on the absolute image URL.

---
 synapse/media/url_previewer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
index 023cc527cc8b..20557bccec28 100644
--- a/synapse/media/url_previewer.py
+++ b/synapse/media/url_previewer.py
@@ -649,16 +649,16 @@ async def _precache_image_url(
         if not image_url:
             return
 
-        # Don't attempt to download the URL if it is blocked.
-        if self._is_url_blocked(image_url):
-            return
-
         # The image URL from the HTML might be relative to the previewed page,
-        # convert it to an URL which can be requested directly.
+        # convert it to a URL which can be requested directly.
         url_parts = urlparse(image_url)
         if url_parts.scheme != "data":
             image_url = urljoin(media_info.uri, image_url)
 
+        # Don't attempt to download the URL if it is blocked.
+        if self._is_url_blocked(image_url):
+            return
+
         # FIXME: it might be cleaner to use the same flow as the main /preview_url
         # request itself and benefit from the same caching etc.  But for now we
         # just rely on the caching on the master request to speed things up.

From 6161a3d80c5d2d605308da3ddc7d5fb57d0256b8 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 16 May 2023 07:20:09 -0400
Subject: [PATCH 7/8] Remove unused test code.

---
 tests/media/test_url_previewer.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/media/test_url_previewer.py b/tests/media/test_url_previewer.py
index e2d67bf7174a..3c4c7d676520 100644
--- a/tests/media/test_url_previewer.py
+++ b/tests/media/test_url_previewer.py
@@ -31,15 +31,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
 
-    hijack_auth = True
-    user_id = "@test:user"
-    end_content = (
-        b"<html><head>"
-        b'<meta property="og:title" content="~matrix~" />'
-        b'<meta property="og:description" content="hi" />'
-        b"</head></html>"
-    )
-
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
         config = self.default_config()
         config["url_preview_enabled"] = True

From 8a073a2cd0007c9391008147965cdafa2e80a2b2 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Tue, 16 May 2023 07:33:13 -0400
Subject: [PATCH 8/8] Move logic into _handle_url.

---
 synapse/media/url_previewer.py       | 21 +++++++++----------
 tests/rest/media/test_url_preview.py | 31 ++--------------------------
 2 files changed, 12 insertions(+), 40 deletions(-)

diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py
index 20557bccec28..dbdb1fd20eda 100644
--- a/synapse/media/url_previewer.py
+++ b/synapse/media/url_previewer.py
@@ -204,11 +204,6 @@ def __init__(
             )
 
     async def preview(self, url: str, user: UserID, ts: int) -> bytes:
-        if self._is_url_blocked(url):
-            raise SynapseError(
-                403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
-            )
-
         # the in-memory cache:
         # * ensures that only one request to a URL is active at a time
         # * takes load off the DB for the thundering herds
@@ -258,7 +253,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
         # If this URL can be accessed via an allowed oEmbed, use that instead.
         url_to_download = url
         oembed_url = self._oembed.get_oembed_url(url)
-        if oembed_url and not self._is_url_blocked(oembed_url):
+        if oembed_url:
             url_to_download = oembed_url
 
         media_info = await self._handle_url(url_to_download, user)
@@ -302,7 +297,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
                 oembed_url = self._oembed.autodiscover_from_html(tree)
                 og_from_oembed: JsonDict = {}
                 # Only download to the oEmbed URL if it is allowed.
-                if oembed_url and not self._is_url_blocked(oembed_url):
+                if oembed_url:
                     try:
                         oembed_info = await self._handle_url(
                             oembed_url, user, allow_data_urls=True
@@ -573,8 +568,16 @@ async def _handle_url(
 
         Returns:
             A MediaInfo object describing the fetched content.
+
+        Raises:
+            SynapseError if the URL is blocked.
         """
 
+        if self._is_url_blocked(url):
+            raise SynapseError(
+                403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
+            )
+
         # TODO: we should probably honour robots.txt... except in practice
         # we're most likely being explicitly triggered by a human rather than a
         # bot, so are we really a robot?
@@ -655,10 +658,6 @@ async def _precache_image_url(
         if url_parts.scheme != "data":
             image_url = urljoin(media_info.uri, image_url)
 
-        # Don't attempt to download the URL if it is blocked.
-        if self._is_url_blocked(image_url):
-            return
-
         # FIXME: it might be cleaner to use the same flow as the main /preview_url
         # request itself and benefit from the same caching etc.  But for now we
         # just rely on the caching on the master request to speed things up.
diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py
index a21369ce238e..7517155cf335 100644
--- a/tests/rest/media/test_url_preview.py
+++ b/tests/rest/media/test_url_preview.py
@@ -1044,11 +1044,9 @@ def test_oembed_format(self) -> None:
         {"url_preview_url_blacklist": [{"netloc": "publish.twitter.com"}]}
     )
     def test_oembed_blocked(self) -> None:
-        """The original URL, not the oEmbed URL, should be used if the oEmbed URL is blocked."""
+        """The oEmbed URL should not be downloaded if the oEmbed URL is blocked."""
         self.lookups["twitter.com"] = [(IPv4Address, "10.1.2.3")]
 
-        content = b"<html><body>Test</body></html>"
-
         channel = self.make_request(
             "GET",
             "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
@@ -1056,32 +1054,7 @@ def test_oembed_blocked(self) -> None:
             await_result=False,
         )
         self.pump()
-
-        client = self.reactor.tcpClients[0][2].buildProtocol(None)
-        server = AccumulatingProtocol()
-        server.makeConnection(FakeTransport(client, self.reactor))
-        client.makeConnection(FakeTransport(server, self.reactor))
-        client.dataReceived(
-            (
-                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
-            )
-            % (len(content),)
-            + content
-        )
-        self.pump()
-
-        # Double check that the proper host is being connected to. (Note that
-        # publish.twitter.com can't be resolved so this is already implicitly checked.)
-        self.assertIn(b"\r\nHost: twitter.com\r\n", server.data)
-
-        # There should not be a second request being made.
-        self.assertEqual(len(self.reactor.tcpClients), 1)
-
-        # It should be succssful, but there should be no image in the body.
-        self.assertEqual(channel.code, 200)
-        body = channel.json_body
-        self.assertEqual(body["og:description"], "Test")
+        self.assertEqual(channel.code, 403, channel.result)
 
     def test_oembed_autodiscovery(self) -> None:
         """