Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Implement a content type allow list for URL previews #11936

Merged
merged 12 commits into from
Feb 10, 2022
1 change: 1 addition & 0 deletions changelog.d/11936.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implement an allow list of content types for which we will attempt to preview a URL. This prevents Synapse from making useless longer-lived connections to streaming media servers.
18 changes: 18 additions & 0 deletions synapse/http/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
TYPE_CHECKING,
Any,
BinaryIO,
Callable,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -693,12 +694,18 @@ async def get_file(
output_stream: BinaryIO,
max_size: Optional[int] = None,
headers: Optional[RawHeaders] = None,
is_allowed_content_type: Optional[Callable[[str], bool]] = None,
) -> Tuple[int, Dict[bytes, List[bytes]], str, int]:
"""GETs a file from a given URL
Args:
url: The URL to GET
output_stream: File to write the response body to.
headers: A map from header name to a list of values for that header
is_allowed_content_type: A predicate to determine whether the
content type of the file we're downloading is allowed. If set and
it evaluates to False when called with the content type, the
request will be terminated before completing the download by
raising SynapseError.
Returns:
A tuple of the file length, dict of the response
headers, absolute URI of the response and HTTP response code.
Expand Down Expand Up @@ -726,6 +733,17 @@ async def get_file(
HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN
)

if is_allowed_content_type and b"Content-Type" in resp_headers:
content_type = resp_headers[b"Content-Type"][0].decode("ascii")
if not is_allowed_content_type(content_type):
raise SynapseError(
HTTPStatus.BAD_GATEWAY,
(
"Requested file's content type not allowed for this operation: %s"
% content_type
),
)

# TODO: if our Content-Type is HTML or something, just read the first
# N bytes into RAM rather than saving it all to disk only to read it
# straight back in again
Expand Down
8 changes: 8 additions & 0 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
is_allowed_content_type=_is_previewable,
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
Expand Down Expand Up @@ -761,3 +762,10 @@ def _is_html(content_type: str) -> bool:

def _is_json(content_type: str) -> bool:
return content_type.lower().startswith("application/json")


def _is_previewable(content_type: str) -> bool:
"""Returns True for content types for which we will perform URL preview and False
otherwise."""

return _is_html(content_type) or _is_media(content_type) or _is_json(content_type)
72 changes: 72 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,78 @@ def test_non_ascii_preview_httpequiv(self):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

def test_video_rejected(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

end_content = b"anything"

channel = self.make_request(
"GET",
"preview_url?url=http://matrix.org",
shorthand=False,
await_result=False,
)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b"Content-Type: video/mp4\r\n\r\n"
)
% (len(end_content))
+ end_content
)

self.pump()
self.assertEqual(channel.code, 502)
self.assertEqual(
channel.json_body,
{
"errcode": "M_UNKNOWN",
"error": "Requested file's content type not allowed for this operation: video/mp4",
},
)

def test_audio_rejected(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

end_content = b"anything"

channel = self.make_request(
"GET",
"preview_url?url=http://matrix.org",
shorthand=False,
await_result=False,
)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b"Content-Type: audio/aac\r\n\r\n"
)
% (len(end_content))
+ end_content
)

self.pump()
self.assertEqual(channel.code, 502)
self.assertEqual(
channel.json_body,
{
"errcode": "M_UNKNOWN",
"error": "Requested file's content type not allowed for this operation: audio/aac",
},
)

def test_non_ascii_preview_content_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

Expand Down