
Avoid storing URL cache files in storage providers #10911

Merged: 8 commits, Sep 27, 2021
Changes from 4 commits
1 change: 1 addition & 0 deletions changelog.d/10911.bugfix
@@ -0,0 +1 @@
Avoid storing URL cache files in storage providers.
11 changes: 6 additions & 5 deletions synapse/rest/media/v1/filepath.py
@@ -195,23 +195,24 @@ def url_cache_thumbnail_rel(

url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)

def url_cache_thumbnail_directory(self, media_id: str) -> str:
def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
Member: I'm not sure why we're making this change, is it just for consistency?

Member: It seems to be so we can use it in tests?

Contributor Author: Yep, it's for the tests.

# Media id is of the form <DATE><RANDOM_STRING>
# E.g.: 2017-09-28-fsdRDt24DS234dsf

if NEW_FORMAT_ID_RE.match(media_id):
return os.path.join(
self.base_path, "url_cache_thumbnails", media_id[:10], media_id[11:]
)
return os.path.join("url_cache_thumbnails", media_id[:10], media_id[11:])
else:
return os.path.join(
self.base_path,
"url_cache_thumbnails",
media_id[0:2],
media_id[2:4],
media_id[4:],
)

url_cache_thumbnail_directory = _wrap_in_base_path(
url_cache_thumbnail_directory_rel
)

def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
"The dirs to try and remove if we delete the media_id thumbnails"
# Media id is of the form <DATE><RANDOM_STRING>
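As the review thread above notes, the rename to url_cache_thumbnail_directory_rel lets tests build the same relative path against either directory tree, while the absolute-path method is recreated by wrapping. Below is a minimal sketch of how that composition could work, assuming _wrap_in_base_path simply joins the store's base path onto the relative result (the helper's actual body is not part of this diff):

import functools
import os
from typing import Any, Callable


def _wrap_in_base_path(func: Callable[..., str]) -> Callable[..., str]:
    """Turn a method returning a media-store-relative path into one that
    returns an absolute path under self.base_path (assumed behaviour)."""

    @functools.wraps(func)
    def _wrapped(self: Any, *args: Any, **kwargs: Any) -> str:
        path = func(self, *args, **kwargs)
        return os.path.join(self.base_path, path)

    return _wrapped


class MediaFilePaths:
    def __init__(self, primary_base_path: str) -> None:
        self.base_path = primary_base_path

    def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
        # New-format media ids look like 2017-09-28-fsdRDt24DS234dsf
        return os.path.join("url_cache_thumbnails", media_id[:10], media_id[11:])

    # The absolute-path variant is derived from the relative one, so tests can
    # join the relative path onto either the media store or a provider directory.
    url_cache_thumbnail_directory = _wrap_in_base_path(
        url_cache_thumbnail_directory_rel
    )


paths = MediaFilePaths("/data/media_store")
rel = paths.url_cache_thumbnail_directory_rel("2017-09-28-fsdRDt24DS234dsf")
assert rel == "url_cache_thumbnails/2017-09-28/fsdRDt24DS234dsf"
assert paths.url_cache_thumbnail_directory("2017-09-28-fsdRDt24DS234dsf") == (
    "/data/media_store/" + rel
)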
10 changes: 10 additions & 0 deletions synapse/rest/media/v1/storage_provider.py
@@ -93,6 +93,11 @@ async def store_file(self, path: str, file_info: FileInfo) -> None:
if file_info.server_name and not self.store_remote:
return None

if file_info.url_cache:
Member: Curious why you chose to do this here instead of where we make the URL previews? My guess is to ensure we don't break this if we add another way of having URL previews?

Contributor Author: A little bit, but mostly for existing code. The call hierarchy gets pretty messy above store_into_file and I found it difficult to figure out which paths might touch the url cache. And I didn't want to thread an extra boolean through _generate_thumbnails.

The choice between store_into_file and here was mostly arbitrary, though it seems like this is where we do the existing filtering based on FileInfo.

# The URL preview cache is short lived and not worth offloading or
# backing up.
return None

if self.store_synchronous:
# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
@@ -110,6 +115,11 @@ async def store() -> None:
run_in_background(store)

async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
if file_info.url_cache:
# Files in the URL preview cache definitely aren't stored here,
# so avoid any potentially slow I/O or network access.
return None

# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
return await maybe_awaitable(self.backend.fetch(path, file_info))
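Both guards live in the wrapper, alongside the existing server_name/store_remote filtering the author refers to, so provider backends never see URL cache entries at all. The following self-contained sketch shows that filtering behaviour; the class and field names mirror the diff, but the toy DictBackend and the simplified FileInfo stand-in are assumptions, not Synapse's real implementations:

import asyncio
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class FileInfo:
    # Simplified stand-in for synapse.rest.media.v1._base.FileInfo,
    # which carries more fields than shown here.
    server_name: Optional[str] = None
    file_id: str = ""
    url_cache: bool = False


class DictBackend:
    """Hypothetical backend that records which paths it was asked to store."""

    def __init__(self) -> None:
        self.stored: Dict[str, bytes] = {}

    async def store_file(self, path: str, file_info: FileInfo) -> None:
        self.stored[path] = b"<file contents>"

    async def fetch(self, path: str, file_info: FileInfo) -> Optional[bytes]:
        return self.stored.get(path)


class StorageProviderWrapper:
    """Sketch of the filtering applied before delegating to the backend."""

    def __init__(self, backend: DictBackend, store_remote: bool = True) -> None:
        self.backend = backend
        self.store_remote = store_remote

    async def store_file(self, path: str, file_info: FileInfo) -> None:
        if file_info.server_name and not self.store_remote:
            # Existing filtering: optionally skip remote media.
            return None
        if file_info.url_cache:
            # New in this PR: URL preview cache files are short lived and
            # never offloaded to the provider.
            return None
        await self.backend.store_file(path, file_info)

    async def fetch(self, path: str, file_info: FileInfo) -> Optional[bytes]:
        if file_info.url_cache:
            # Nothing was offloaded, so skip any slow I/O or network access.
            return None
        return await self.backend.fetch(path, file_info)


async def main() -> None:
    wrapper = StorageProviderWrapper(DictBackend())

    await wrapper.store_file("url_cache/2021-09-27/abcdef", FileInfo(url_cache=True))
    await wrapper.store_file("local_content/ab/cd/ef", FileInfo())

    # The URL cache file never reached the backend; local media did.
    assert "url_cache/2021-09-27/abcdef" not in wrapper.backend.stored
    assert "local_content/ab/cd/ef" in wrapper.backend.stored


asyncio.run(main())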
128 changes: 128 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
@@ -21,6 +21,7 @@
from twisted.test.proto_helpers import AccumulatingProtocol

from synapse.config.oembed import OEmbedEndpointConfig
from synapse.util.stringutils import parse_and_validate_mxc_uri

from tests import unittest
from tests.server import FakeTransport
@@ -721,3 +722,130 @@ def test_oembed_format(self):
"og:description": "Content Preview",
},
)

def _download_image(self):
"""Downloads an image into the URL cache.

Returns:
A (host, media_id) tuple representing the MXC URI of the image.
"""
self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

channel = self.make_request(
"GET",
"preview_url?url=http://cdn.twitter.com/matrixdotorg",
shorthand=False,
await_result=False,
)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
% (len(SMALL_PNG),)
+ SMALL_PNG
)

self.pump()
self.assertEqual(channel.code, 200)
body = channel.json_body
mxc_uri = body["og:image"]
host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
self.assertIsNone(_port)
return host, media_id

def test_storage_providers_exclude_files(self):
"""Test that files are not stored in or fetched from storage providers."""
host, media_id = self._download_image()

rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
media_store_path = os.path.join(self.media_store_path, rel_file_path)
storage_provider_path = os.path.join(self.storage_path, rel_file_path)

# Check storage
self.assertTrue(os.path.isfile(media_store_path))
self.assertFalse(
os.path.isfile(storage_provider_path),
"URL cache file was unexpectedly stored in a storage provider",
)

# Check fetching
channel = self.make_request(
"GET",
f"download/{host}/{media_id}",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(channel.code, 200)

os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
os.rename(media_store_path, storage_provider_path)

channel = self.make_request(
"GET",
f"download/{host}/{media_id}",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(
channel.code,
404,
"URL cache file was unexpectedly retrieved from a storage provider",
)

def test_storage_providers_exclude_thumbnails(self):
"""Test that thumbnails are not stored in or fetched from storage providers."""
host, media_id = self._download_image()

rel_thumbnail_path = (
self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
)
media_store_thumbnail_path = os.path.join(
self.media_store_path, rel_thumbnail_path
)
storage_provider_thumbnail_path = os.path.join(
self.storage_path, rel_thumbnail_path
)

# Check storage
self.assertTrue(os.path.isdir(media_store_thumbnail_path))
self.assertFalse(
os.path.isdir(storage_provider_thumbnail_path),
"URL cache thumbnails were unexpectedly stored in a storage provider",
)

# Check fetching
channel = self.make_request(
"GET",
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(channel.code, 200)

# Remove the original, otherwise thumbnails will regenerate
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
media_store_path = os.path.join(self.media_store_path, rel_file_path)
os.remove(media_store_path)

os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)

channel = self.make_request(
"GET",
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(
channel.code,
404,
"URL cache thumbnail was unexpectedly retrieved from a storage provider",
)
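The new tests rely on self.media_store_path and self.storage_path, which are set up elsewhere in the test class and are not part of this hunk. For orientation only, one plausible fixture (treat the details as assumptions, not code from this PR) points a file-system storage provider at a second temporary directory so the assertions can check which of the two trees a file lands in:

import os

def make_homeserver(self, reactor, clock):
    # Hypothetical sketch of the URLPreviewTests fixture; the real setup is
    # outside this diff and includes further URL preview configuration.
    config = self.default_config()

    self.media_store_path = self.mktemp()
    self.storage_path = self.mktemp()
    os.mkdir(self.media_store_path)
    os.mkdir(self.storage_path)

    config["media_store_path"] = self.media_store_path
    config["media_storage_providers"] = [
        {
            "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
            "store_local": True,
            "store_remote": True,
            "store_synchronous": False,
            "config": {"directory": self.storage_path},
        }
    ]

    return self.setup_test_homeserver(config=config)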