From 7dd0c1730a1ea5962a77b9bbb883c1690b25b686 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson
Date: Sun, 24 Jan 2016 18:47:27 -0500
Subject: [PATCH 01/40] initial WIP of a tentative preview_url endpoint -
 incomplete, untested, experimental, etc. just putting it here for
 safekeeping for now

---
 docs/url_previews.rst                         |  74 ++++++++
 synapse/config/repository.py                  |   6 +-
 synapse/http/client.py                        |  81 +++++++++
 synapse/rest/media/v1/media_repository.py     |   3 +
 synapse/rest/media/v1/preview_url_resource.py | 164 ++++++++++++++++++
 5 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 docs/url_previews.rst
 create mode 100644 synapse/rest/media/v1/preview_url_resource.py

diff --git a/docs/url_previews.rst b/docs/url_previews.rst
new file mode 100644
index 000000000000..1dc6ee0c45cf
--- /dev/null
+++ b/docs/url_previews.rst
@@ -0,0 +1,74 @@
+URL Previews
+============
+
+Design notes on a URL previewing service for Matrix:
+
+Options are:
+
+ 1. Have an AS which listens for URLs, downloads them, and inserts an event that describes their metadata.
+   * Pros:
+     * Decouples the implementation entirely from Synapse.
+     * Uses existing Matrix events & content repo to store the metadata.
+   * Cons:
+     * Which AS should provide this service for a room, and why should you trust it?
+     * Doesn't work well with E2E; you'd have to cut the AS into every room, and the AS would end up subscribing to every room anyway.
+
+ 2. Have a generic preview API (nothing to do with Matrix) that provides a previewing service:
+   * Pros:
+     * Simple and flexible; can be used by any client at any point
+   * Cons:
+     * If each HS provides one of these independently, all the HSes in a room may needlessly DoS the target URI
+     * We need somewhere to store the URL metadata rather than just using Matrix itself
+     * We can't piggyback on Matrix to distribute the metadata between HSes.
+
+ 3. Make the synapse of the sending user responsible for spidering the URL and inserting an event asynchronously which describes the metadata.
+   * Pros:
+     * Works transparently for all clients
+     * Piggy-backs nicely on using Matrix for distributing the metadata.
+     * No confusion as to which AS should provide the service.
+   * Cons:
+     * Doesn't work with E2E
+     * We might want to decouple the implementation of the spider from the HS, given spider behaviour can be quite complicated and evolve much more rapidly than the HS. It's more like a bot than a core part of the server.
+
+ 4. Make the sending client use the preview API and insert the event itself when successful.
+   * Pros:
+     * Works well with E2E
+     * No custom server functionality
+     * Lets the client customise the preview that they send (like on FB)
+   * Cons:
+     * Entirely specific to the sending client, whereas it'd be nice if /any/ URL was correctly previewed if clients support it.
+
+ 5. Have the option of specifying a shared (centralised) previewing service used by a room, to avoid all the different HSes in the room DoSing the target.
+
+The best solution is probably a combination of options 2 and 4:
+ * Sending clients do their best to create and send a preview at the point of sending the message, perhaps delaying the message until the preview is computed? (This also lets the user validate the preview before sending.)
+ * Receiving clients have the option of going and creating their own preview if one doesn't arrive soon enough (or if the original sender didn't create one).
+
+This is a bit magical though, in that the preview could come from two entirely different sources - the sending HS or your local one. However, this can always be exposed to users: "Generate your own URL previews if none are available?"
+
+This is also tantamount to senders calculating their own thumbnails for sending in advance of the main content - we are trusting the sender not to lie about the content in the thumbnail, whereas currently thumbnails are calculated by the receiving homeserver to avoid this attack.
+
+However, this kind of phishing attack does exist whether we let senders pick their thumbnails or not, in that a malicious sender can send normal text messages around the attachment claiming it to be legitimate. We could rely on (future) reputation/abuse management to punish users who phish (be it with bogus metadata or bogus descriptions). Bogus metadata is particularly bad though, especially if it's avoidable.

+As a first cut, let's do #2 and have the receiver hit the API to calculate its own previews (as it does currently for image thumbnails). We can then extend/optimise this to option 4 as a special extra if needed.
+
+API
+---
+
+GET /_matrix/media/r0/previewUrl?url=http://wherever.com
+200 OK
+{
+    "og:type" : "article",
+    "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672",
+    "og:title" : "Matrix on Twitter",
+    "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png",
+    "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”",
+    "og:site_name" : "Twitter"
+}
+
+* Downloads the URL
+  * If HTML, just stores it in RAM and parses it for OG meta tags
+    * Download any media OG meta tags to the media repo, and refer to them in the OG via mxc:// URIs.
+  * If a media filetype we know we can thumbnail: store it on disk, and hand it to the thumbnailer. Generate OG meta tags from the thumbnailer contents.
+  * Otherwise, don't bother downloading further.
diff --git a/synapse/config/repository.py b/synapse/config/repository.py
index 2fcf872449c1..33fff5616d30 100644
--- a/synapse/config/repository.py
+++ b/synapse/config/repository.py
@@ -53,6 +53,7 @@ class ContentRepositoryConfig(Config):
     def read_config(self, config):
         self.max_upload_size = self.parse_size(config["max_upload_size"])
         self.max_image_pixels = self.parse_size(config["max_image_pixels"])
+        self.max_spider_size = self.parse_size(config["max_spider_size"])
         self.media_store_path = self.ensure_directory(config["media_store_path"])
         self.uploads_path = self.ensure_directory(config["uploads_path"])
         self.dynamic_thumbnails = config["dynamic_thumbnails"]
@@ -73,6 +74,9 @@ def default_config(self, **kwargs):
         # The largest allowed upload size in bytes
         max_upload_size: "10M"
 
+        # The largest allowed URL preview spidering size in bytes
+        max_spider_size: "10M"
+
         # Maximum number of pixels that will be thumbnailed
         max_image_pixels: "32M"
 
@@ -80,7 +84,7 @@ def default_config(self, **kwargs):
         # the resolution requested by the client. If true then whenever
         # a new resolution is requested by the client the server will
         # generate a new thumbnail. If false the server will pick a thumbnail
-        # from a precalcualted list.
+        # from a precalculated list.
         dynamic_thumbnails: false
 
         # List of thumbnail to precalculate when an image is uploaded.
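For readers who want to poke at the endpoint sketched above, client-side usage boils down to a single authenticated GET. A hypothetical sketch (the homeserver URL and token are invented, requests is just a convenient client-side library, and the path follows the preview_url resource registered later in this patch rather than the previewUrl spelling in the doc):

    # Hypothetical caller for the URL preview API described in
    # docs/url_previews.rst; illustration only, not part of the patch.
    import requests

    def get_url_preview(hs_url, target_url, access_token):
        resp = requests.get(
            hs_url + "/_matrix/media/r0/preview_url",
            params={"url": target_url, "access_token": access_token},
        )
        resp.raise_for_status()
        return resp.json()  # dict of OpenGraph keys, e.g. {"og:title": ...}

    og = get_url_preview(
        "https://matrix.org",
        "https://twitter.com/matrixdotorg/status/684074366691356672",
        "SOME_ACCESS_TOKEN",
    )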
diff --git a/synapse/http/client.py b/synapse/http/client.py index fdd90b1c3c7b..25d319f126cb 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -238,6 +238,87 @@ def get_raw(self, uri, args={}): else: raise CodeMessageException(response.code, body) + # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. + # The two should be factored out. + + @defer.inlineCallbacks + def get_file(self, url, output_stream, args={}, max_size=None): + """GETs a file from a given URL + Args: + url (str): The URL to GET + output_stream (file): File to write the response body to. + Returns: + A (int,dict) tuple of the file length and a dict of the response + headers. + """ + + def body_callback(method, url_bytes, headers_dict): + self.sign_request(destination, method, url_bytes, headers_dict) + return None + + response = yield self.request( + "GET", + url.encode("ascii"), + headers=Headers({ + b"User-Agent": [self.user_agent], + }) + ) + + headers = dict(response.headers.getAllRawHeaders()) + + if headers['Content-Length'] > max_size: + logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) + # XXX: do we want to explicitly drop the connection here somehow? if so, how? + raise # what should we be raising here? + + # TODO: if our Content-Type is HTML or something, just read the first + # N bytes into RAM rather than saving it all to disk only to read it + # straight back in again + + try: + length = yield preserve_context_over_fn( + _readBodyToFile, + response, output_stream, max_size + ) + except: + logger.exception("Failed to download body") + raise + + defer.returnValue((length, headers)) + + +# XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. +# The two should be factored out. + +class _ReadBodyToFileProtocol(protocol.Protocol): + def __init__(self, stream, deferred, max_size): + self.stream = stream + self.deferred = deferred + self.length = 0 + self.max_size = max_size + + def dataReceived(self, data): + self.stream.write(data) + self.length += len(data) + if self.max_size is not None and self.length >= self.max_size: + logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) + self.deferred = defer.Deferred() + self.transport.loseConnection() + + def connectionLost(self, reason): + if reason.check(ResponseDone): + self.deferred.callback(self.length) + else: + self.deferred.errback(reason) + + +# XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. +# The two should be factored out. 
+ +def _readBodyToFile(response, stream, max_size): + d = defer.Deferred() + response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size)) + return d class CaptchaServerHttpClient(SimpleHttpClient): """ diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 7dfb027dd11e..8f3491b91c66 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -17,6 +17,7 @@ from .download_resource import DownloadResource from .thumbnail_resource import ThumbnailResource from .identicon_resource import IdenticonResource +from .preview_url_resource import PreviewUrlResource from .filepath import MediaFilePaths from twisted.web.resource import Resource @@ -78,3 +79,5 @@ def __init__(self, hs): self.putChild("download", DownloadResource(hs, filepaths)) self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("identicon", IdenticonResource()) + self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) + diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py new file mode 100644 index 000000000000..fb8ab3096fe7 --- /dev/null +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -0,0 +1,164 @@ +# Copyright 2016 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from twisted.web.resource import Resource +from lxml import html +from synapse.http.client import SimpleHttpClient +from synapse.http.server import respond_with_json_bytes +from simplejson import json + +import logging +logger = logging.getLogger(__name__) + +class PreviewUrlResource(Resource): + isLeaf = True + + def __init__(self, hs, filepaths): + Resource.__init__(self) + self.client = SimpleHttpClient(hs) + self.filepaths = filepaths + self.max_spider_size = hs.config.max_spider_size + self.server_name = hs.hostname + self.clock = hs.get_clock() + + def render_GET(self, request): + self._async_render_GET(request) + return NOT_DONE_YET + + @request_handler + @defer.inlineCallbacks + def _async_render_GET(self, request): + url = request.args.get("url") + + try: + # TODO: keep track of whether there's an ongoing request for this preview + # and block and return their details if there is one. + + media_info = self._download_url(url) + except: + os.remove(fname) + raise + + if self._is_media(media_type): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info.media_type, + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_type): + tree = html.parse(media_info.filename) + + # suck it up into lxml and define our OG response. 
+ # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + + def _download_url(url): + requester = yield self.auth.get_user_by_req(request) + + # XXX: horrible duplication with base_resource's _download_remote_file() + file_id = random_string(24) + + fname = self.filepaths.local_media_filepath(file_id) + self._makedirs(fname) + + try: + with open(fname, "wb") as f: + length, headers = yield self.client.get_file( + url, output_stream=f, max_size=self.max_spider_size, + ) + media_type = headers["Content-Type"][0] + time_now_ms = self.clock.time_msec() + + content_disposition = headers.get("Content-Disposition", None) + if content_disposition: + _, params = cgi.parse_header(content_disposition[0],) + download_name = None + + # First check if there is a valid UTF-8 filename + download_name_utf8 = params.get("filename*", None) + if download_name_utf8: + if download_name_utf8.lower().startswith("utf-8''"): + download_name = download_name_utf8[7:] + + # If there isn't check for an ascii name. 
+                if not download_name:
+                    download_name_ascii = params.get("filename", None)
+                    if download_name_ascii and is_ascii(download_name_ascii):
+                        download_name = download_name_ascii
+
+                if download_name:
+                    download_name = urlparse.unquote(download_name)
+                    try:
+                        download_name = download_name.decode("utf-8")
+                    except UnicodeDecodeError:
+                        download_name = None
+            else:
+                download_name = None
+
+            yield self.store.store_local_media(
+                media_id=fname,
+                media_type=media_type,
+                time_now_ms=self.clock.time_msec(),
+                upload_name=download_name,
+                media_length=length,
+                user_id=requester.user,
+            )
+
+        except:
+            os.remove(fname)
+            raise
+
+        return {
+            "media_type": media_type,
+            "media_length": length,
+            "download_name": download_name,
+            "created_ts": time_now_ms,
+            "filesystem_id": file_id,
+            "filename": fname,
+        }
+
+
+    def _is_media(content_type):
+        if content_type.lower().startswith("image/"):
+            return True
+
+    def _is_html(content_type):
+        content_type = content_type.lower()
+        if content_type == "text/html" or
+            content_type.startswith("application/xhtml"):
+            return True

From adafa24b0a8f539c114c7d45f36f7b62743557f6 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson
Date: Fri, 25 Mar 2016 23:38:19 +0000
Subject: [PATCH 02/40] typo

---
 synapse/replication/resource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/synapse/replication/resource.py b/synapse/replication/resource.py
index 8c1ae0fbc7f5..37a1d3960c48 100644
--- a/synapse/replication/resource.py
+++ b/synapse/replication/resource.py
@@ -76,7 +76,7 @@ class ReplicationResource(Resource):
     The response is a JSON object with keys for each stream with updates. Under
     each key is a JSON object with:
 
-    * "postion": The current position of the stream.
+    * "position": The current position of the stream.
     * "field_names": The names of the fields in each row.
     * "rows": The updates as an array of arrays.
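Back on the preview endpoint: the heart of patch 01's HTML handling is a single xpath over OpenGraph meta tags. As a minimal standalone sketch of the same lxml technique (the sample page is invented; the real code parses the spidered file from disk):

    # Standalone sketch of the OG scrape in _async_render_GET: collect
    # <meta property="og:*" content="..."> pairs into a dict with lxml.
    from lxml import html

    def extract_og(page):
        og = {}
        tree = html.fromstring(page)
        for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
            if 'content' in tag.attrib:  # skip malformed meta tags
                og[tag.attrib['property']] = tag.attrib['content']
        return og

    page = (
        '<html><head>'
        '<meta property="og:title" content="Matrix on Twitter"/>'
        '<meta property="og:site_name" content="Twitter"/>'
        '</head><body></body></html>'
    )
    assert extract_og(page) == {
        "og:title": "Matrix on Twitter",
        "og:site_name": "Twitter",
    }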
From dd4287ca5d0c3e3df566748e0dd6ab36398f64b4 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 02:07:57 +0100 Subject: [PATCH 04/40] make it build --- synapse/http/client.py | 2 +- synapse/python_dependencies.py | 1 + synapse/rest/media/v1/preview_url_resource.py | 17 +++++++++-------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 127690e53474..a735300db0d1 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -21,7 +21,7 @@ from canonicaljson import encode_canonical_json -from twisted.internet import defer, reactor, ssl +from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( Agent, readBody, FileBodyProducer, PartialDownloadError, ) diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py index 0a6043ae8df5..d12ef1504318 100644 --- a/synapse/python_dependencies.py +++ b/synapse/python_dependencies.py @@ -36,6 +36,7 @@ "blist": ["blist"], "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"], "pymacaroons-pynacl": ["pymacaroons"], + "lxml>=3.6.0": ["lxml"], } CONDITIONAL_REQUIREMENTS = { "web_client": { diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index fb8ab3096fe7..5c8e20e23c67 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -13,10 +13,11 @@ # limitations under the License. from twisted.web.resource import Resource +from twisted.internet import defer from lxml import html from synapse.http.client import SimpleHttpClient -from synapse.http.server import respond_with_json_bytes -from simplejson import json +from synapse.http.server import request_handler, respond_with_json_bytes +import ujson as json import logging logger = logging.getLogger(__name__) @@ -75,7 +76,7 @@ def _async_render_GET(self, request): # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" # "og:title" : "Matrix on Twitter" # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + # "og:description" : "Synapse 0.12 is out! 
Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" # "og:site_name" : "Twitter" og = {} @@ -143,15 +144,15 @@ def _download_url(url): os.remove(fname) raise - return { + yield ({ "media_type": media_type, "media_length": length, "download_name": download_name, "created_ts": time_now_ms, "filesystem_id": file_id, "filename": fname, - } - + }) + return def _is_media(content_type): if content_type.lower().startswith("image/"): @@ -159,6 +160,6 @@ def _is_media(content_type): def _is_html(content_type): content_type = content_type.lower() - if content_type == "text/html" or - content_type.startswith("application/xhtml"): + if (content_type == "text/html" or + content_type.startswith("application/xhtml")): return True From 64b4aead15927be56d7433250462c03f2d1f4565 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:13:25 +0100 Subject: [PATCH 05/40] make it work --- docs/url_previews.rst | 2 +- synapse/http/client.py | 3 +- synapse/rest/media/v1/base_resource.py | 1 + synapse/rest/media/v1/preview_url_resource.py | 131 ++++++++++-------- 4 files changed, 80 insertions(+), 57 deletions(-) diff --git a/docs/url_previews.rst b/docs/url_previews.rst index 1dc6ee0c45cf..634d9d907f12 100644 --- a/docs/url_previews.rst +++ b/docs/url_previews.rst @@ -56,7 +56,7 @@ As a first cut, let's do #2 and have the receiver hit the API to calculate its o API --- -GET /_matrix/media/r0/previewUrl?url=http://wherever.com +GET /_matrix/media/r0/preview_url?url=http://wherever.com 200 OK { "og:type" : "article" diff --git a/synapse/http/client.py b/synapse/http/client.py index a735300db0d1..cfdea91b5728 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -26,6 +26,7 @@ Agent, readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http_headers import Headers +from twisted.web._newclient import ResponseDone from StringIO import StringIO @@ -266,7 +267,7 @@ def body_callback(method, url_bytes, headers_dict): headers = dict(response.headers.getAllRawHeaders()) - if headers['Content-Length'] > max_size: + if 'Content-Length' in headers and headers['Content-Length'] > max_size: logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) # XXX: do we want to explicitly drop the connection here somehow? if so, how? raise # what should we be raising here? diff --git a/synapse/rest/media/v1/base_resource.py b/synapse/rest/media/v1/base_resource.py index 58ef91c0b87f..2b1938dc8e4c 100644 --- a/synapse/rest/media/v1/base_resource.py +++ b/synapse/rest/media/v1/base_resource.py @@ -72,6 +72,7 @@ def __init__(self, hs, filepaths): self.store = hs.get_datastore() self.max_upload_size = hs.config.max_upload_size self.max_image_pixels = hs.config.max_image_pixels + self.max_spider_size = hs.config.max_spider_size self.filepaths = filepaths self.version_string = hs.version_string self.downloads = {} diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 5c8e20e23c67..408b103367db 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from .base_resource import BaseMediaResource +from synapse.api.errors import Codes from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from synapse.util.stringutils import random_string from synapse.http.client import SimpleHttpClient -from synapse.http.server import request_handler, respond_with_json_bytes +from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes + +import os import ujson as json import logging logger = logging.getLogger(__name__) -class PreviewUrlResource(Resource): +class PreviewUrlResource(BaseMediaResource): isLeaf = True def __init__(self, hs, filepaths): - Resource.__init__(self) + BaseMediaResource.__init__(self, hs, filepaths) self.client = SimpleHttpClient(hs) - self.filepaths = filepaths - self.max_spider_size = hs.config.max_spider_size - self.server_name = hs.hostname - self.clock = hs.get_clock() def render_GET(self, request): self._async_render_GET(request) @@ -40,57 +42,76 @@ def render_GET(self, request): @request_handler @defer.inlineCallbacks def _async_render_GET(self, request): - url = request.args.get("url") try: + # XXX: if get_user_by_req fails, what should we do in an async render? + requester = yield self.auth.get_user_by_req(request) + url = request.args.get("url")[0] + # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. - media_info = self._download_url(url) + media_info = yield self._download_url(url, requester.user) + + logger.warn("got media_info of '%s'" % media_info) + + if self._is_media(media_info['media_type']): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info['media_type'], + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_info['media_type']): + tree = html.parse(media_info['filename']) + logger.warn(html.tostring(tree)) + + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + else: + logger.warn("Failed to find any OG data in %s", url) + og = {} + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: - os.remove(fname) + # XXX: if we don't explicitly respond here, the request never returns. + # isn't this what server.py's wrapper is meant to be doing for us? 
+ respond_with_json( + request, + 500, + { + "error": "Internal server error", + "errcode": Codes.UNKNOWN, + }, + send_cors=True + ) raise - if self._is_media(media_type): - dims = yield self._generate_local_thumbnails( - media_info.filesystem_id, media_info - ) - - og = { - "og:description" : media_info.download_name, - "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), - "og:image:type" : media_info.media_type, - "og:image:width" : dims.width, - "og:image:height" : dims.height, - } - - # define our OG response for this media - elif self._is_html(media_type): - tree = html.parse(media_info.filename) - - # suck it up into lxml and define our OG response. - # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG - - respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - - def _download_url(url): - requester = yield self.auth.get_user_by_req(request) + @defer.inlineCallbacks + def _download_url(self, url, user): # XXX: horrible duplication with base_resource's _download_remote_file() file_id = random_string(24) @@ -99,6 +120,7 @@ def _download_url(url): try: with open(fname, "wb") as f: + logger.warn("Trying to get url '%s'" % url) length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) @@ -137,14 +159,14 @@ def _download_url(url): time_now_ms=self.clock.time_msec(), upload_name=download_name, media_length=length, - user_id=requester.user, + user_id=user, ) except: os.remove(fname) raise - yield ({ + defer.returnValue({ "media_type": media_type, "media_length": length, "download_name": download_name, @@ -152,14 +174,13 @@ def _download_url(url): "filesystem_id": file_id, "filename": fname, }) - return - def _is_media(content_type): + def _is_media(self, content_type): if content_type.lower().startswith("image/"): return True - def _is_html(content_type): + def _is_html(self, content_type): content_type = content_type.lower() - if (content_type == "text/html" or + if (content_type.startswith("text/html") or content_type.startswith("application/xhtml")): return True From 19038582d3957eef2b662d28035361ecf9d3a84e Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:14:16 +0100 Subject: [PATCH 06/40] debug --- synapse/rest/media/v1/preview_url_resource.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 408b103367db..4f7c9e3d1b9f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -53,7 +53,7 @@ def _async_render_GET(self, request): media_info = yield self._download_url(url, 
requester.user) - logger.warn("got media_info of '%s'" % media_info) + logger.debug("got media_info of '%s'" % media_info) if self._is_media(media_info['media_type']): dims = yield self._generate_local_thumbnails( @@ -71,7 +71,6 @@ def _async_render_GET(self, request): # define our OG response for this media elif self._is_html(media_info['media_type']): tree = html.parse(media_info['filename']) - logger.warn(html.tostring(tree)) # suck it up into lxml and define our OG response. # if we see any URLs in the OG response, then spider them @@ -120,7 +119,7 @@ def _download_url(self, url, user): try: with open(fname, "wb") as f: - logger.warn("Trying to get url '%s'" % url) + logger.debug("Trying to get url '%s'" % url) length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) From 721b2bfa851bcf91948e166587dce4da666739b1 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:32:52 +0100 Subject: [PATCH 07/40] implement redirects --- synapse/http/client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index cfdea91b5728..71b2e3375eaf 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -23,7 +23,7 @@ from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( - Agent, readBody, FileBodyProducer, PartialDownloadError, + RedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http_headers import Headers from twisted.web._newclient import ResponseDone @@ -59,11 +59,11 @@ def __init__(self, hs): # The default context factory in Twisted 14.0.0 (which we require) is # BrowserLikePolicyForHTTPS which will do regular cert validation # 'like a browser' - self.agent = Agent( + self.agent = RedirectAgent(Agent( reactor, connectTimeout=15, contextFactory=hs.get_http_client_context_factory() - ) + )) self.user_agent = hs.version_string if hs.config.user_agent_suffix: self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,) From ae5831d30354c713cd1693f3b74cf048de7428a7 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Tue, 29 Mar 2016 03:32:55 +0100 Subject: [PATCH 08/40] fix bugs --- synapse/rest/media/v1/preview_url_resource.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 4f7c9e3d1b9f..b999944e86e6 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -57,15 +57,15 @@ def _async_render_GET(self, request): if self._is_media(media_info['media_type']): dims = yield self._generate_local_thumbnails( - media_info.filesystem_id, media_info + media_info['filesystem_id'], media_info ) og = { - "og:description" : media_info.download_name, - "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:description" : media_info['download_name'], + "og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), "og:image:type" : media_info['media_type'], - "og:image:width" : dims.width, - "og:image:height" : dims.height, + "og:image:width" : dims['width'], + "og:image:height" : dims['height'], } # define our OG response for this media @@ -123,6 +123,7 @@ def _download_url(self, url, user): length, headers = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) + # FIXME: handle 404s sanely - don't spider an error page media_type = 
headers["Content-Type"][0] time_now_ms = self.clock.time_msec() From 7178ab7da069708172d0bf8222b3e0a0daf6a090 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Wed, 30 Mar 2016 17:29:22 +0100 Subject: [PATCH 09/40] spell out more packages --- README.rst | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 285fc5aa8a7d..6136e0c1fe91 100644 --- a/README.rst +++ b/README.rst @@ -118,7 +118,6 @@ Installing prerequisites on CentOS 7:: python-virtualenv libffi-devel openssl-devel sudo yum groupinstall "Development Tools" - Installing prerequisites on Mac OS X:: xcode-select --install @@ -150,12 +149,7 @@ In case of problems, please see the _Troubleshooting section below. Alternatively, Silvio Fricke has contributed a Dockerfile to automate the above in Docker at https://registry.hub.docker.com/u/silviof/docker-matrix/. -Another alternative is to install via apt from http://matrix.org/packages/debian/. -Note that these packages do not include a client - choose one from -https://matrix.org/blog/try-matrix-now/ (or build your own with -https://github.com/matrix-org/matrix-js-sdk/). - -Finally, Martin Giess has created an auto-deployment process with vagrant/ansible, +Also, Martin Giess has created an auto-deployment process with vagrant/ansible, tested with VirtualBox/AWS/DigitalOcean - see https://github.com/EMnify/matrix-synapse-auto-deploy for details. @@ -229,6 +223,19 @@ For information on how to install and use PostgreSQL, please see Platform Specific Instructions ============================== +Debian +------ + +Matrix provides official Debian packages via apt from http://matrix.org/packages/debian/. +Note that these packages do not include a client - choose one from +https://matrix.org/blog/try-matrix-now/ (or build your own with one of our SDKs :) + +Fedora +------ + +Oleg Girko provides Fedora RPMs at +https://obs.infoserver.lv/project/monitor/matrix-synapse + ArchLinux --------- @@ -270,11 +277,17 @@ During setup of Synapse you need to call python2.7 directly again:: FreeBSD ------- -Synapse can be installed via FreeBSD Ports or Packages: +Synapse can be installed via FreeBSD Ports or Packages contributed by Brendan Molloy from: - Ports: ``cd /usr/ports/net/py-matrix-synapse && make install clean`` - Packages: ``pkg install py27-matrix-synapse`` +NixOS +----- + +Robin Lambertz has packaged Synapse for NixOS at: +https://github.com/NixOS/nixpkgs/blob/master/nixos/modules/services/misc/matrix-synapse.nix + Windows Install --------------- Synapse can be installed on Cygwin. It requires the following Cygwin packages: From a8a5dd3b44a4526307502bd621ee0bd43c87c77f Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 01:55:21 +0100 Subject: [PATCH 10/40] handle requests with missing content-length headers (e.g. 
 YouTube)

---
 synapse/http/client.py                        | 33 +++++++++++++++----
 synapse/rest/media/v1/preview_url_resource.py |  4 +--
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/synapse/http/client.py b/synapse/http/client.py
index 71b2e3375eaf..30f31a915d3d 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -23,8 +23,9 @@
 
 from twisted.internet import defer, reactor, ssl, protocol
 from twisted.web.client import (
-    RedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError,
+    BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError,
 )
+from twisted.web.http import PotentialDataLoss
 from twisted.web.http_headers import Headers
 from twisted.web._newclient import ResponseDone
 
@@ -59,11 +60,11 @@ def __init__(self, hs):
         # The default context factory in Twisted 14.0.0 (which we require) is
         # BrowserLikePolicyForHTTPS which will do regular cert validation
         # 'like a browser'
-        self.agent = RedirectAgent(Agent(
+        self.agent = Agent(
             reactor,
             connectTimeout=15,
             contextFactory=hs.get_http_client_context_factory()
-        ))
+        )
         self.user_agent = hs.version_string
         if hs.config.user_agent_suffix:
             self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,)
@@ -253,10 +254,6 @@ def get_file(self, url, output_stream, args={}, max_size=None):
             headers.
         """
 
-        def body_callback(method, url_bytes, headers_dict):
-            self.sign_request(destination, method, url_bytes, headers_dict)
-            return None
-
         response = yield self.request(
             "GET",
             url.encode("ascii"),
@@ -309,6 +306,10 @@ def dataReceived(self, data):
     def connectionLost(self, reason):
         if reason.check(ResponseDone):
             self.deferred.callback(self.length)
+        elif reason.check(PotentialDataLoss):
+            # stolen from https://github.com/twisted/treq/pull/49/files
+            # http://twistedmatrix.com/trac/ticket/4840
+            self.deferred.callback(self.length)
         else:
             self.deferred.errback(reason)
 
@@ -350,6 +351,24 @@ def post_urlencoded_get_raw(self, url, args={}):
             # twisted dislikes google's response, no content length.
             defer.returnValue(e.response)
 
+class SpiderHttpClient(SimpleHttpClient):
+    """
+    Separate HTTP client for spidering arbitrary URLs.
+    Special in that it follows redirects and has a UA that looks
+    like a browser.
+
+    used by the preview_url endpoint in the content repo.
+    """
+    def __init__(self, hs):
+        SimpleHttpClient.__init__(self, hs)
+        # clobber the base class's agent and UA:
+        self.agent = BrowserLikeRedirectAgent(Agent(
+            reactor,
+            connectTimeout=15,
+            contextFactory=hs.get_http_client_context_factory()
+        ))
+        # Look like Chrome for now
+        #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string)
 
 def encode_urlencode_args(args):
     return {k: encode_urlencode_arg(v) for k, v in args.items()}
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index b999944e86e6..ca2529cc1037 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -19,7 +19,7 @@
 from twisted.internet import defer
 from lxml import html
 from synapse.util.stringutils import random_string
-from synapse.http.client import SimpleHttpClient
+from synapse.http.client import SpiderHttpClient
 from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
 
 import os
@@ -33,7 +33,7 @@ class PreviewUrlResource(BaseMediaResource):
 
     def __init__(self, hs, filepaths):
         BaseMediaResource.__init__(self, hs, filepaths)
-        self.client = SimpleHttpClient(hs)
+        self.client = SpiderHttpClient(hs)
 
     def render_GET(self, request):
         self._async_render_GET(request)

From 0d3d7de6fcb98972532bf9aaa983ddd8befb3db8 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson
Date: Thu, 31 Mar 2016 12:42:27 +0100
Subject: [PATCH 11/40] sync in changes from matrixfederationclient

---
 synapse/http/client.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/synapse/http/client.py b/synapse/http/client.py
index 30f31a915d3d..219b73426850 100644
--- a/synapse/http/client.py
+++ b/synapse/http/client.py
@@ -244,7 +244,7 @@ def get_raw(self, uri, args={}):
     # The two should be factored out.
 
     @defer.inlineCallbacks
-    def get_file(self, url, output_stream, args={}, max_size=None):
+    def get_file(self, url, output_stream, max_size=None):
         """GETs a file from a given URL
         Args:
             url (str): The URL to GET
@@ -299,7 +299,11 @@ def dataReceived(self, data):
         self.stream.write(data)
         self.length += len(data)
         if self.max_size is not None and self.length >= self.max_size:
-            logger.warn("Requested URL is too large > %r bytes" % (self.max_size,))
+            self.deferred.errback(SynapseError(
+                502,
+                "Requested file is too large > %r bytes" % (self.max_size,),
+                Codes.TOO_LARGE,
+            ))
             self.deferred = defer.Deferred()
             self.transport.loseConnection()

From bb9a2ca87c280e1c6ff6740ee9d2764e1b5226a5 Mon Sep 17 00:00:00 2001
From: Matthew Hodgson
Date: Thu, 31 Mar 2016 14:15:09 +0100
Subject: [PATCH 12/40] synthesise basic OG metadata from pages lacking it

---
 synapse/rest/media/v1/preview_url_resource.py | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index ca2529cc1037..b1d5cabfaaab 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -23,6 +23,7 @@
 from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
 
 import os
+import re
 import ujson as json
 
 import logging
@@ -70,6 +71,7 @@ def _async_render_GET(self, request):
 
             # define our OG response for this media
             elif self._is_html(media_info['media_type']):
+                # TODO: somehow stop a big HTML tree from exploding synapse's RAM
                 tree = html.parse(media_info['filename'])
 
                 # suck it up into lxml and define our OG response.
@@ -82,17 +84,58 @@ def _async_render_GET(self, request): # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" # "og:site_name" : "Twitter" + + # or: + + # "og:type" : "video", + # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", + # "og:site_name" : "YouTube", + # "og:video:type" : "application/x-shockwave-flash", + # "og:description" : " ", + # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", + # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", + # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:width" : "1280" + # "og:video:height" : "720", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", og = {} for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): og[tag.attrib['property']] = tag.attrib['content'] + if not og: + # do some basic spidering of the HTML + title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") + og['og:title'] = title[0].text if title else None + + images = tree.xpath("//img") + big_images = [ i for i in images if ( + 'width' in i and 'height' in i and + i.attrib['width'] > 64 and i.attrib['height'] > 64 + )] or images + og['og:image'] = images[0].attrib['src'] if images else None + + text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + text = '' + for text_node in text_nodes: + if len(text) < 1024: + text += text_node + ' ' + else: + break + text = re.sub(r'[\t ]+', ' ', text) + text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) + text = text.strip()[:1024] + og['og:description'] = text if text else None + + # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too # TODO: store our OG details in a cache (and expire them when stale) # TODO: delete the content to stop diskfilling, as we only ever cared about its OG else: logger.warn("Failed to find any OG data in %s", url) og = {} + logger.warn(og) + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: # XXX: if we don't explicitly respond here, the request never returns. @@ -111,6 +154,10 @@ def _async_render_GET(self, request): @defer.inlineCallbacks def _download_url(self, url, user): + # TODO: we should probably honour robots.txt... except in practice + # we're most likely being explicitly triggered by a human rather than a + # bot, so are we really a robot? 
+ # XXX: horrible duplication with base_resource's _download_remote_file() file_id = random_string(24) From 72550c3803e5020aa377f8d10c0c20afd4273c0d Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 15:14:14 +0100 Subject: [PATCH 13/40] prevent choking on invalid utf-8, and handle image thumbnailing smarter --- synapse/rest/media/v1/preview_url_resource.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index b1d5cabfaaab..04d02ee42796 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -72,7 +72,15 @@ def _async_render_GET(self, request): # define our OG response for this media elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - tree = html.parse(media_info['filename']) + + # XXX: can't work out how to make lxml ignore UTF8 decoding errors + # so slurp as a string at this point. + file = open(media_info['filename']) + body = file.read() + file.close() + # FIXME: we shouldn't be forcing utf-8 if the page isn't actually utf-8... + tree = html.fromstring(body.decode('utf-8','ignore')) + # tree = html.parse(media_info['filename']) # suck it up into lxml and define our OG response. # if we see any URLs in the OG response, then spider them @@ -108,14 +116,19 @@ def _async_render_GET(self, request): title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") og['og:title'] = title[0].text if title else None - images = tree.xpath("//img") + images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] big_images = [ i for i in images if ( - 'width' in i and 'height' in i and + 'width' in i.attrib and 'height' in i.attrib and i.attrib['width'] > 64 and i.attrib['height'] > 64 - )] or images - og['og:image'] = images[0].attrib['src'] if images else None + )] + big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + images = big_images if big_images else images + + if images: + og['og:image'] = images[0].attrib['src'] text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") text = '' for text_node in text_nodes: if len(text) < 1024: From 683e564815be5f7852c417cbab06876db6122401 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 23:52:58 +0100 Subject: [PATCH 14/40] handle spidered relative images correctly --- synapse/http/client.py | 2 +- synapse/rest/media/v1/preview_url_resource.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 219b73426850..1b6f7cb79584 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -282,7 +282,7 @@ def get_file(self, url, output_stream, max_size=None): logger.exception("Failed to download body") raise - defer.returnValue((length, headers)) + defer.returnValue((length, headers, response.request.absoluteURI)) # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. 
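The preview_url_resource hunk below resolves relative image src attributes against the page's final URI by hand with urlparse/urlunparse. For comparison, the stdlib's urljoin implements the same resolution rules; a sketch with invented URLs (not what the patch itself uses):

    # urlparse.urljoin covers the cases the hand-rolled rebase handles
    # (urllib.parse.urljoin on Python 3); example URLs are invented.
    from urlparse import urljoin

    base = "http://example.com/articles/story.html"
    # relative path: resolved against the base document's directory
    assert urljoin(base, "img/pic.png") == "http://example.com/articles/img/pic.png"
    # root-relative path: keeps scheme and host, replaces the path
    assert urljoin(base, "/img/pic.png") == "http://example.com/img/pic.png"
    # already-absolute URL: passed through untouched
    assert urljoin(base, "https://cdn.example.net/pic.png") == "https://cdn.example.net/pic.png"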
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 04d02ee42796..bae3905a437b 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -18,6 +18,7 @@ from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html +from urlparse import urlparse, urlunparse from synapse.util.stringutils import random_string from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes @@ -125,7 +126,14 @@ def _async_render_GET(self, request): images = big_images if big_images else images if images: - og['og:image'] = images[0].attrib['src'] + base = list(urlparse(media_info['uri'])) + src = list(urlparse(images[0].attrib['src'])) + if not src[0] and not src[1]: + src[0] = base[0] + src[1] = base[1] + if not src[2].startswith('/'): + src[2] = re.sub(r'/[^/]+$', '/', base[2]) + src[2] + og['og:image'] = urlunparse(src) text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") @@ -140,6 +148,7 @@ def _async_render_GET(self, request): text = text.strip()[:1024] og['og:description'] = text if text else None + # TODO: extract a favicon? # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too # TODO: store our OG details in a cache (and expire them when stale) # TODO: delete the content to stop diskfilling, as we only ever cared about its OG @@ -180,7 +189,7 @@ def _download_url(self, url, user): try: with open(fname, "wb") as f: logger.debug("Trying to get url '%s'" % url) - length, headers = yield self.client.get_file( + length, headers, uri = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) # FIXME: handle 404s sanely - don't spider an error page @@ -233,6 +242,7 @@ def _download_url(self, url, user): "created_ts": time_now_ms, "filesystem_id": file_id, "filename": fname, + "uri": uri, }) def _is_media(self, content_type): From c60b751694bbeb82105eb828d41c0b5c26d5e195 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Fri, 1 Apr 2016 02:17:48 +0100 Subject: [PATCH 15/40] fix assorted redirect, unicode and screenscraping bugs --- synapse/rest/media/v1/preview_url_resource.py | 174 ++++++++++-------- 1 file changed, 96 insertions(+), 78 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index bae3905a437b..a7ffe593b1a9 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -74,84 +74,93 @@ def _async_render_GET(self, request): elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - # XXX: can't work out how to make lxml ignore UTF8 decoding errors - # so slurp as a string at this point. - file = open(media_info['filename']) - body = file.read() - file.close() - # FIXME: we shouldn't be forcing utf-8 if the page isn't actually utf-8... - tree = html.fromstring(body.decode('utf-8','ignore')) - # tree = html.parse(media_info['filename']) - - # suck it up into lxml and define our OG response. 
- # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - # or: - - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : " ", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - if not og: - # do some basic spidering of the HTML - title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") - og['og:title'] = title[0].text if title else None - - images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] - big_images = [ i for i in images if ( - 'width' in i.attrib and 'height' in i.attrib and - i.attrib['width'] > 64 and i.attrib['height'] > 64 - )] - big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) - images = big_images if big_images else images - - if images: - base = list(urlparse(media_info['uri'])) - src = list(urlparse(images[0].attrib['src'])) - if not src[0] and not src[1]: - src[0] = base[0] - src[1] = base[1] - if not src[2].startswith('/'): - src[2] = re.sub(r'/[^/]+$', '/', base[2]) + src[2] - og['og:image'] = urlunparse(src) - - text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") - text = '' - for text_node in text_nodes: - if len(text) < 1024: - text += text_node + ' ' + def _calc_og(): + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! 
Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + # or: + + # "og:type" : "video", + # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", + # "og:site_name" : "YouTube", + # "og:video:type" : "application/x-shockwave-flash", + # "og:description" : " ", + # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", + # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", + # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:width" : "1280" + # "og:video:height" : "720", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + if 'og:title' not in og: + # do some basic spidering of the HTML + title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") + og['og:title'] = title[0].text if title else None + + + if 'og:image' not in og: + meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); + if meta_image: + og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: - break - text = re.sub(r'[\t ]+', ' ', text) - text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) - text = text.strip()[:1024] - og['og:description'] = text if text else None - - # TODO: extract a favicon? - # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] + big_images = [ i for i in images if ( + 'width' in i.attrib and 'height' in i.attrib and + i.attrib['width'] > 64 and i.attrib['height'] > 64 + )] + big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + images = big_images if big_images else images + + if images: + og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) + + if 'og:description' not in og: + meta_description = tree.xpath("//*/meta[@name='description']/@content"); + if meta_description: + og['og:description'] = meta_description[0] + else: + text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") + text = '' + for text_node in text_nodes: + if len(text) < 500: + text += text_node + ' ' + else: + break + text = re.sub(r'[\t ]+', ' ', text) + text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) + text = text.strip()[:500] + og['og:description'] = text if text else None + + # TODO: extract a favicon? 
+ # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + return og + + try: + tree = html.parse(media_info['filename']) + og = _calc_og() + except UnicodeDecodeError: + # XXX: evil evil bodge + file = open(media_info['filename']) + body = file.read() + file.close() + tree = html.fromstring(body.decode('utf-8','ignore')) + og = _calc_og() + else: logger.warn("Failed to find any OG data in %s", url) og = {} @@ -173,6 +182,15 @@ def _async_render_GET(self, request): ) raise + def _rebase_url(self, url, base): + base = list(urlparse(base)) + url = list(urlparse(url)) + if not url[0] and not url[1]: + url[0] = base[0] + url[1] = base[1] + if not url[2].startswith('/'): + url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] + return urlunparse(url) @defer.inlineCallbacks def _download_url(self, url, user): @@ -223,7 +241,7 @@ def _download_url(self, url, user): download_name = None yield self.store.store_local_media( - media_id=fname, + media_id=file_id, media_type=media_type, time_now_ms=self.clock.time_msec(), upload_name=download_name, From 5fd07da76473f7a361db4b16b58fc4c21acc4af0 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 00:35:49 +0100 Subject: [PATCH 16/40] refactor calc_og; spider image URLs; fix xpath; add a (broken) expiringcache; loads of other fixes --- synapse/rest/media/v1/preview_url_resource.py | 202 +++++++++++------- 1 file changed, 121 insertions(+), 81 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index a7ffe593b1a9..1273472dabe0 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -20,6 +20,7 @@ from lxml import html from urlparse import urlparse, urlunparse from synapse.util.stringutils import random_string +from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes @@ -36,6 +37,12 @@ class PreviewUrlResource(BaseMediaResource): def __init__(self, hs, filepaths): BaseMediaResource.__init__(self, hs, filepaths) self.client = SpiderHttpClient(hs) + self.cache = ExpiringCache( + cache_name = "url_previews", + clock = self.clock, + expiry_ms = 60*60*1000, # don't spider URLs more often than once an hour + ) + self.cache.start() def render_GET(self, request): self._async_render_GET(request) @@ -50,6 +57,11 @@ def _async_render_GET(self, request): requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] + if self.cache: + og = self.cache.get(url) + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + return + # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. @@ -74,98 +86,25 @@ def _async_render_GET(self, request): elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - def _calc_og(): - # suck it up into lxml and define our OG response. 
- # if we see any URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - # or: - - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : " ", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - og[tag.attrib['property']] = tag.attrib['content'] - - if 'og:title' not in og: - # do some basic spidering of the HTML - title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") - og['og:title'] = title[0].text if title else None - - - if 'og:image' not in og: - meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); - if meta_image: - og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) - else: - images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ] - big_images = [ i for i in images if ( - 'width' in i.attrib and 'height' in i.attrib and - i.attrib['width'] > 64 and i.attrib['height'] > 64 - )] - big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) - images = big_images if big_images else images - - if images: - og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) - - if 'og:description' not in og: - meta_description = tree.xpath("//*/meta[@name='description']/@content"); - if meta_description: - og['og:description'] = meta_description[0] - else: - text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()") - text = '' - for text_node in text_nodes: - if len(text) < 500: - text += text_node + ' ' - else: - break - text = re.sub(r'[\t ]+', ' ', text) - text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) - text = text.strip()[:500] - og['og:description'] = text if text else None - - # TODO: extract a favicon? 
- # TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too - # TODO: store our OG details in a cache (and expire them when stale) - # TODO: delete the content to stop diskfilling, as we only ever cared about its OG - return og - try: tree = html.parse(media_info['filename']) - og = _calc_og() + og = yield self._calc_og(tree, media_info, requester) except UnicodeDecodeError: # XXX: evil evil bodge file = open(media_info['filename']) body = file.read() file.close() tree = html.fromstring(body.decode('utf-8','ignore')) - og = _calc_og() + og = yield self._calc_og(tree, media_info, requester) else: logger.warn("Failed to find any OG data in %s", url) og = {} - logger.warn(og) + if self.cache: + self.cache[url] = og + + logger.warn(og); respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: @@ -182,11 +121,112 @@ def _calc_og(): ) raise + @defer.inlineCallbacks + def _calc_og(self, tree, media_info, requester): + # suck our tree into lxml and define our OG response. + + # if we see any image URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "Synapse 0.12 is out! Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" + # "og:site_name" : "Twitter" + + # or: + + # "og:type" : "video", + # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", + # "og:site_name" : "YouTube", + # "og:video:type" : "application/x-shockwave-flash", + # "og:description" : " ", + # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", + # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", + # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:width" : "1280" + # "og:video:height" : "720", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: grab article: meta tags too, e.g.: + + # + # + # + # + # + # + + if 'og:title' not in og: + # do some basic spidering of the HTML + title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") + og['og:title'] = title[0].text.strip() if title else None + + + if 'og:image' not in og: + # TODO: extract a favicon failing all else + meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); + if meta_image: + og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) + else: + images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") + images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + if not images: + images = tree.xpath("//img[@src]") + if images: + og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) + + # pre-cache the image for posterity + if 'og:image' in og and og['og:image']: + image_info = yield self._download_url(og['og:image'], requester.user) + + if self._is_media(image_info['media_type']): + # TODO: make sure we don't choke on white-on-transparent images + dims = yield self._generate_local_thumbnails( + image_info['filesystem_id'], 
image_info + ) + og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) + og["og:image:type"] = image_info['media_type'] + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + del og["og:image"] + + if 'og:description' not in og: + meta_description = tree.xpath("//*/meta[@name='description']/@content"); + if meta_description: + og['og:description'] = meta_description[0] + else: + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") + text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " + + "ancestor::footer | ancestor::script | ancestor::style)]" + + "[ancestor::body]") + text = '' + for text_node in text_nodes: + if len(text) < 500: + text += text_node + ' ' + else: + break + text = re.sub(r'[\t ]+', ' ', text) + text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) + text = text.strip()[:500] + og['og:description'] = text if text else None + + # TODO: persist a cache mapping { url, etag } -> { og, mxc of url (if we bother keeping it around), age } + # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG + defer.returnValue(og); + def _rebase_url(self, url, base): base = list(urlparse(base)) url = list(urlparse(url)) - if not url[0] and not url[1]: - url[0] = base[0] + if not url[0]: + url[0] = base[0] or "http" + if not url[1]: url[1] = base[1] if not url[2].startswith('/'): url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] From b26e8604f168b0f1ecc095bd0d6a717128361a41 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 01:35:44 +0100 Subject: [PATCH 17/40] make meta comparisons case insensitive --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 1273472dabe0..77757548bdd7 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -170,7 +170,7 @@ def _calc_og(self, tree, media_info, requester): if 'og:image' not in og: # TODO: extract a favicon failing all else - meta_image = tree.xpath("//*/meta[@itemprop='image']/@content"); + meta_image = tree.xpath("//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"); if meta_image: og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: @@ -198,7 +198,7 @@ def _calc_og(self, tree, media_info, requester): del og["og:image"] if 'og:description' not in og: - meta_description = tree.xpath("//*/meta[@name='description']/@content"); + meta_description = tree.xpath("//*/meta[translate(@name, 'DESCRIPTION', 'description')='description']/@content"); if meta_description: og['og:description'] = meta_description[0] else: From 5037ee0d37f7e5c7a62f5af5ceef5363701e3202 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 02:29:57 +0100 Subject: [PATCH 18/40] handle missing dimensions without crashing --- synapse/rest/media/v1/preview_url_resource.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 77757548bdd7..3ffdafce0918 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -78,10 +78,14 @@ def _async_render_GET(self, request): "og:description" : media_info['download_name'], 
"og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), "og:image:type" : media_info['media_type'], - "og:image:width" : dims['width'], - "og:image:height" : dims['height'], } + if dims: + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + logger.warn("Couldn't get dims for %s" % url) + # define our OG response for this media elif self._is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM @@ -174,6 +178,7 @@ def _calc_og(self, tree, media_info, requester): if meta_image: og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: + # TODO: consider inlined CSS styles as well as width & height attribs images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) if not images: @@ -190,10 +195,14 @@ def _calc_og(self, tree, media_info, requester): dims = yield self._generate_local_thumbnails( image_info['filesystem_id'], image_info ) + if dims: + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + logger.warn("Couldn't get dims for %s" % og["og:image"]) + og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) og["og:image:type"] = image_info['media_type'] - og["og:image:width"] = dims['width'] - og["og:image:height"] = dims['height'] else: del og["og:image"] From 2c838f6459db35ad9812a83184d85a06ca5d940a Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 02:30:07 +0100 Subject: [PATCH 19/40] pass back SVGs as their own thumbnails --- synapse/rest/media/v1/thumbnail_resource.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py index ab52499785cd..1e71738bc401 100644 --- a/synapse/rest/media/v1/thumbnail_resource.py +++ b/synapse/rest/media/v1/thumbnail_resource.py @@ -72,6 +72,11 @@ def _respond_local_thumbnail(self, request, media_id, width, height, self._respond_404(request) return + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.local_media_filepath(media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id) if thumbnail_infos: @@ -103,6 +108,11 @@ def _select_or_generate_local_thumbnail(self, request, media_id, desired_width, self._respond_404(request) return + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.local_media_filepath(media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_local_media_thumbnails(media_id) for info in thumbnail_infos: t_w = info["thumbnail_width"] == desired_width @@ -138,6 +148,11 @@ def _select_or_generate_remote_thumbnail(self, request, server_name, media_id, desired_method, desired_type): media_info = yield self._get_remote_media(server_name, media_id) + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.remote_media_filepath(server_name, media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_remote_media_thumbnails( server_name, media_id, ) @@ -181,6 +196,11 @@ def _respond_remote_thumbnail(self, request, server_name, media_id, width, # We should proxy the thumbnail from the remote server 
instead. media_info = yield self._get_remote_media(server_name, media_id) + if media_info["media_type"] == "image/svg+xml": + file_path = self.filepaths.remote_media_filepath(server_name, media_id) + yield self._respond_with_file(request, media_info["media_type"], file_path) + return + thumbnail_infos = yield self.store.get_remote_media_thumbnails( server_name, media_id, ) From 93771579610d723488486f40622d6c99ed061d7f Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 02:31:45 +0100 Subject: [PATCH 20/40] how was _respond_default_thumbnail ever meant to work? --- synapse/rest/media/v1/thumbnail_resource.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/synapse/rest/media/v1/thumbnail_resource.py b/synapse/rest/media/v1/thumbnail_resource.py index 1e71738bc401..513b44568888 100644 --- a/synapse/rest/media/v1/thumbnail_resource.py +++ b/synapse/rest/media/v1/thumbnail_resource.py @@ -228,6 +228,8 @@ def _respond_remote_thumbnail(self, request, server_name, media_id, width, @defer.inlineCallbacks def _respond_default_thumbnail(self, request, media_info, width, height, method, m_type): + # XXX: how is this meant to work? store.get_default_thumbnails + # appears to always return [] so won't this always 404? media_type = media_info["media_type"] top_level_type = media_type.split("/")[0] sub_type = media_type.split("/")[-1].split(";")[0] From d1b154a10fc0f71fb36010f784ca6570f845c8d5 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 2 Apr 2016 03:06:39 +0100 Subject: [PATCH 21/40] support gzip compression, and don't pass through error msgs --- synapse/http/client.py | 11 ++++++++--- synapse/rest/media/v1/preview_url_resource.py | 5 +++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 1b6f7cb79584..b21bf17378ea 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -23,7 +23,8 @@ from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( - BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, + BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent, + readBody, FileBodyProducer, PartialDownloadError, ) from twisted.web.http import PotentialDataLoss from twisted.web.http_headers import Headers @@ -269,6 +270,10 @@ def get_file(self, url, output_stream, max_size=None): # XXX: do we want to explicitly drop the connection here somehow? if so, how? raise # what should we be raising here? 
+ if response.code > 299: + logger.warn("Got %d when downloading %s" % (response.code, url)) + raise + # TODO: if our Content-Type is HTML or something, just read the first # N bytes into RAM rather than saving it all to disk only to read it # straight back in again @@ -366,11 +371,11 @@ class SpiderHttpClient(SimpleHttpClient): def __init__(self, hs): SimpleHttpClient.__init__(self, hs) # clobber the base class's agent and UA: - self.agent = BrowserLikeRedirectAgent(Agent( + self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent( reactor, connectTimeout=15, contextFactory=hs.get_http_client_context_factory() - )) + )), [('gzip', GzipDecoder)]) # Look like Chrome for now #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 3ffdafce0918..162e09ba7180 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -200,7 +200,7 @@ def _calc_og(self, tree, media_info, requester): og["og:image:height"] = dims['height'] else: logger.warn("Couldn't get dims for %s" % og["og:image"]) - + og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) og["og:image:type"] = image_info['media_type'] else: @@ -259,7 +259,8 @@ def _download_url(self, url, user): length, headers, uri = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) - # FIXME: handle 404s sanely - don't spider an error page + # FIXME: pass through 404s and other error messages nicely + media_type = headers["Content-Type"][0] time_now_ms = self.clock.time_msec() From 7426c86eb88a7abef9af7ba544ccd709b25e8304 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 00:31:57 +0100 Subject: [PATCH 22/40] add a persistent cache of URL lookups, and fix up the in-memory one to work --- synapse/http/client.py | 6 +- synapse/rest/media/v1/preview_url_resource.py | 64 ++++++++++++++++--- synapse/storage/media_repository.py | 54 +++++++++++++++- .../30/local_media_repository_url_cache.sql | 27 ++++++++ 4 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 synapse/storage/schema/delta/30/local_media_repository_url_cache.sql diff --git a/synapse/http/client.py b/synapse/http/client.py index b21bf17378ea..f42a36ffa648 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -251,8 +251,8 @@ def get_file(self, url, output_stream, max_size=None): url (str): The URL to GET output_stream (file): File to write the response body to. Returns: - A (int,dict) tuple of the file length and a dict of the response - headers. + A (int,dict,string,int) tuple of the file length, dict of the response + headers, absolute URI of the response and HTTP response code. """ response = yield self.request( @@ -287,7 +287,7 @@ def get_file(self, url, output_stream, max_size=None): logger.exception("Failed to download body") raise - defer.returnValue((length, headers, response.request.absoluteURI)) + defer.returnValue((length, headers, response.request.absoluteURI, response.code)) # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. 
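
The get_file change just above widens its return value to a four-tuple, so every caller now unpacks the response code as well. A minimal sketch of a hypothetical caller under the new signature (the function name and error handling are illustrative, not part of the patch):

    from twisted.internet import defer

    @defer.inlineCallbacks
    def fetch_to_disk(client, url, path, max_size):
        # get_file now yields (length, headers, absolute_uri, response_code);
        # an older three-value unpack would raise ValueError here.
        with open(path, "wb") as f:
            length, headers, uri, code = yield client.get_file(
                url, output_stream=f, max_size=max_size,
            )
        # same "2xx" convention the URL cache check uses (integer division on Python 2)
        if code / 100 != 2:
            raise RuntimeError("Got %d when downloading %s" % (code, uri))
        defer.returnValue((length, headers, uri, code))
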
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 162e09ba7180..86341cc4ccda 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -37,6 +37,8 @@ class PreviewUrlResource(BaseMediaResource): def __init__(self, hs, filepaths): BaseMediaResource.__init__(self, hs, filepaths) self.client = SpiderHttpClient(hs) + + # simple memory cache mapping urls to OG metadata self.cache = ExpiringCache( cache_name = "url_previews", clock = self.clock, @@ -56,17 +58,41 @@ def _async_render_GET(self, request): # XXX: if get_user_by_req fails, what should we do in an async render? requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] - - if self.cache: - og = self.cache.get(url) - respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - return + ts = request.args.get("ts")[0] if "ts" in request.args else self.clock.time_msec() # TODO: keep track of whether there's an ongoing request for this preview # and block and return their details if there is one. + # first check the memory cache - good to handle all the clients on this + # HS thundering away to preview the same URL at the same time. + try: + og = self.cache[url] + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + return + except: + pass + + # then check the URL cache in the DB (which will also provide us with + # historical previews, if we have any) + cache_result = yield self.store.get_url_cache(url, ts) + if ( + cache_result and + cache_result["download_ts"] + cache_result["expires"] > ts and + cache_result["response_code"] / 100 == 2 + ): + respond_with_json_bytes( + request, 200, cache_result["og"].encode('utf-8'), + send_cors=True + ) + return + media_info = yield self._download_url(url, requester.user) + # FIXME: we should probably update our cache now anyway, so that + # even if the OG calculation raises, we don't keep hammering on the + # remote server. For now, leave it uncached to aid debugging OG + # calculation problems + logger.debug("got media_info of '%s'" % media_info) if self._is_media(media_info['media_type']): @@ -105,10 +131,21 @@ def _async_render_GET(self, request): logger.warn("Failed to find any OG data in %s", url) og = {} - if self.cache: - self.cache[url] = og + logger.debug("Calculated OG for %s as %s" % (url, og)); + + # store OG in ephemeral in-memory cache + self.cache[url] = og - logger.warn(og); + # store OG in history-aware DB cache + yield self.store.store_url_cache( + url, + media_info["response_code"], + media_info["etag"], + media_info["expires"], + json.dumps(og), + media_info["filesystem_id"], + media_info["created_ts"], + ) respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) except: @@ -187,6 +224,9 @@ def _calc_og(self, tree, media_info, requester): og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) # pre-cache the image for posterity + # FIXME: it might be cleaner to use the same flow as the main /preview_url request itself + # and benefit from the same caching etc. But for now we just rely on the caching + # of the master request to speed things up. 
if 'og:image' in og and og['og:image']: image_info = yield self._download_url(og['og:image'], requester.user) @@ -226,7 +266,6 @@ def _calc_og(self, tree, media_info, requester): text = text.strip()[:500] og['og:description'] = text if text else None - # TODO: persist a cache mapping { url, etag } -> { og, mxc of url (if we bother keeping it around), age } # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG defer.returnValue(og); @@ -256,7 +295,7 @@ def _download_url(self, url, user): try: with open(fname, "wb") as f: logger.debug("Trying to get url '%s'" % url) - length, headers, uri = yield self.client.get_file( + length, headers, uri, code = yield self.client.get_file( url, output_stream=f, max_size=self.max_spider_size, ) # FIXME: pass through 404s and other error messages nicely @@ -311,6 +350,11 @@ def _download_url(self, url, user): "filesystem_id": file_id, "filename": fname, "uri": uri, + "response_code": code, + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. + "expires": 60 * 60 * 1000, + "etag": headers["ETag"] if "ETag" in headers else None, }) def _is_media(self, content_type): diff --git a/synapse/storage/media_repository.py b/synapse/storage/media_repository.py index 9d3ba324783b..bb002081ae50 100644 --- a/synapse/storage/media_repository.py +++ b/synapse/storage/media_repository.py @@ -25,7 +25,7 @@ def get_default_thumbnails(self, top_level_type, sub_type): def get_local_media(self, media_id): """Get the metadata for a local piece of media Returns: - None if the meia_id doesn't exist. + None if the media_id doesn't exist. """ return self._simple_select_one( "local_media_repository", @@ -50,6 +50,58 @@ def store_local_media(self, media_id, media_type, time_now_ms, upload_name, desc="store_local_media", ) + def get_url_cache(self, url, ts): + """Get the media_id and ts for a cached URL as of the given timestamp + Returns: + None if the URL isn't cached. + """ + def get_url_cache_txn(txn): + # get the most recently cached result (relative to the given ts) + sql = ( + "SELECT response_code, etag, expires, og, media_id, max(download_ts)" + " FROM local_media_repository_url_cache" + " WHERE url = ? AND download_ts <= ?" + ) + txn.execute(sql, (url, ts)) + row = txn.fetchone() + + if not row[3]: + # ...or if we've requested a timestamp older than the oldest + # copy in the cache, return the oldest copy (if any) + sql = ( + "SELECT response_code, etag, expires, og, media_id, min(download_ts)" + " FROM local_media_repository_url_cache" + " WHERE url = ? AND download_ts > ?" 
+ ) + txn.execute(sql, (url, ts)) + row = txn.fetchone() + + if not row[3]: + return None + + return dict(zip(( + 'response_code', 'etag', 'expires', 'og', 'media_id', 'download_ts' + ), row)) + + return self.runInteraction( + "get_url_cache", get_url_cache_txn + ) + + def store_url_cache(self, url, response_code, etag, expires, og, media_id, download_ts): + return self._simple_insert( + "local_media_repository_url_cache", + { + "url": url, + "response_code": response_code, + "etag": etag, + "expires": expires, + "og": og, + "media_id": media_id, + "download_ts": download_ts, + }, + desc="store_url_cache", + ) + def get_local_media_thumbnails(self, media_id): return self._simple_select_list( "local_media_repository_thumbnails", diff --git a/synapse/storage/schema/delta/30/local_media_repository_url_cache.sql b/synapse/storage/schema/delta/30/local_media_repository_url_cache.sql new file mode 100644 index 000000000000..9efb4280ebec --- /dev/null +++ b/synapse/storage/schema/delta/30/local_media_repository_url_cache.sql @@ -0,0 +1,27 @@ +/* Copyright 2016 OpenMarket Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CREATE TABLE local_media_repository_url_cache( + url TEXT, -- the URL being cached + response_code INTEGER, -- the HTTP response code of this download attempt + etag TEXT, -- the etag header of this response + expires INTEGER, -- the number of ms this response was valid for + og TEXT, -- cache of the OG metadata of this URL as JSON + media_id TEXT, -- the media_id, if any, of the URL's content in the repo + download_ts BIGINT -- the timestamp of this download attempt +); + +CREATE INDEX local_media_repository_url_cache_by_url_download_ts + ON local_media_repository_url_cache(url, download_ts); From b09e29a03ca95c577215acbe8d5037d6337e1af3 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 00:47:40 +0100 Subject: [PATCH 23/40] Ensure only one download for a given URL is active at a time --- synapse/rest/media/v1/preview_url_resource.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 86341cc4ccda..c20de57991df 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -23,6 +23,7 @@ from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes +from synapse.util.async import ObservableDeferred import os import re @@ -46,6 +47,8 @@ def __init__(self, hs, filepaths): ) self.cache.start() + self.downloads = {} + def render_GET(self, request): self._async_render_GET(request) return NOT_DONE_YET @@ -86,7 +89,21 @@ def _async_render_GET(self, request): ) return - media_info = yield self._download_url(url, requester.user) + # Ensure only one download for a given URL is active at a time + download = self.downloads.get(url) + if 
download is None: + download = self._download_url(url, requester.user) + download = ObservableDeferred( + download, + consumeErrors=True + ) + self.downloads[url] = download + + @download.addBoth + def callback(media_info): + del self.downloads[key] + return media_info + media_info = yield download.observe() # FIXME: we should probably update our cache now anyway, so that # even if the OG calculation raises, we don't keep hammering on the From 110780b18b029c5b6f1c34f7b4e027b88ea8b8ce Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 00:48:31 +0100 Subject: [PATCH 24/40] remove stale todo --- synapse/rest/media/v1/preview_url_resource.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index c20de57991df..582dd20fa6b4 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -63,9 +63,6 @@ def _async_render_GET(self, request): url = request.args.get("url")[0] ts = request.args.get("ts")[0] if "ts" in request.args else self.clock.time_msec() - # TODO: keep track of whether there's an ongoing request for this preview - # and block and return their details if there is one. - # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. try: From c3916462f68df84df29ad924c07f8e83c0143fcc Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 01:33:12 +0100 Subject: [PATCH 25/40] rebase all image URLs --- synapse/rest/media/v1/preview_url_resource.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 582dd20fa6b4..31ce2b5831b4 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -235,14 +235,14 @@ def _calc_og(self, tree, media_info, requester): if not images: images = tree.xpath("//img[@src]") if images: - og['og:image'] = self._rebase_url(images[0].attrib['src'], media_info['uri']) + og['og:image'] = images[0].attrib['src'] # pre-cache the image for posterity # FIXME: it might be cleaner to use the same flow as the main /preview_url request itself # and benefit from the same caching etc. But for now we just rely on the caching # of the master request to speed things up. if 'og:image' in og and og['og:image']: - image_info = yield self._download_url(og['og:image'], requester.user) + image_info = yield self._download_url(self._rebase_url(og['og:image'], media_info['uri']), requester.user) if self._is_media(image_info['media_type']): # TODO: make sure we don't choke on white-on-transparent images @@ -286,9 +286,9 @@ def _calc_og(self, tree, media_info, requester): def _rebase_url(self, url, base): base = list(urlparse(base)) url = list(urlparse(url)) - if not url[0]: + if not url[0]: # fix up schema url[0] = base[0] or "http" - if not url[1]: + if not url[1]: # fix up hostname url[1] = base[1] if not url[2].startswith('/'): url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] From eab4d462f8e5d17c5ca7592d1ea15d8e4771a00c Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 02:02:46 +0100 Subject: [PATCH 26/40] fix etag typing error. 
fix timestamp typing error --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 31ce2b5831b4..7c69c01a6c40 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -61,7 +61,7 @@ def _async_render_GET(self, request): # XXX: if get_user_by_req fails, what should we do in an async render? requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] - ts = request.args.get("ts")[0] if "ts" in request.args else self.clock.time_msec() + ts = int(request.args.get("ts")[0]) if "ts" in request.args else self.clock.time_msec() # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. @@ -368,7 +368,7 @@ def _download_url(self, url, user): # FIXME: we should calculate a proper expiration based on the # Cache-Control and Expire headers. But for now, assume 1 hour. "expires": 60 * 60 * 1000, - "etag": headers["ETag"] if "ETag" in headers else None, + "etag": headers["ETag"][0] if "ETag" in headers else None, }) def _is_media(self, content_type): From 8b98a7e8c37f0fae09f33a6d93953584288ed394 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 12:56:29 +0100 Subject: [PATCH 27/40] pep8 --- synapse/http/client.py | 14 +- synapse/rest/media/v1/media_repository.py | 1 - synapse/rest/media/v1/preview_url_resource.py | 127 ++++++++++-------- synapse/storage/media_repository.py | 3 +- 4 files changed, 85 insertions(+), 60 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index f42a36ffa648..442b4bb73ddf 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -15,7 +15,9 @@ from OpenSSL import SSL from OpenSSL.SSL import VERIFY_NONE -from synapse.api.errors import CodeMessageException +from synapse.api.errors import ( + CodeMessageException, SynapseError, Codes, +) from synapse.util.logcontext import preserve_context_over_fn import synapse.metrics @@ -268,7 +270,7 @@ def get_file(self, url, output_stream, max_size=None): if 'Content-Length' in headers and headers['Content-Length'] > max_size: logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) # XXX: do we want to explicitly drop the connection here somehow? if so, how? - raise # what should we be raising here? + raise # what should we be raising here? if response.code > 299: logger.warn("Got %d when downloading %s" % (response.code, url)) @@ -331,6 +333,7 @@ def _readBodyToFile(response, stream, max_size): response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size)) return d + class CaptchaServerHttpClient(SimpleHttpClient): """ Separate HTTP client for talking to google's captcha servers @@ -360,6 +363,7 @@ def post_urlencoded_get_raw(self, url, args={}): # twisted dislikes google's response, no content length. defer.returnValue(e.response) + class SpiderHttpClient(SimpleHttpClient): """ Separate HTTP client for spidering arbitrary URLs. 
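
The etag and timestamp typing fixes above share a cause: both the headers mapping built from the Twisted response and request.args map each name to a list of raw string values, so code that wants a scalar has to index into the list and, for the timestamp, convert it. A small illustrative sketch (the literal values are invented):

    # headers map each header name to a *list* of raw values:
    headers = {"ETag": ['"abc123"'], "Content-Type": ["text/html; charset=utf-8"]}
    etag = headers["ETag"][0] if "ETag" in headers else None  # a string, not a list

    # request.args values are likewise lists of strings, so the timestamp
    # needs int() before it can be compared against clock.time_msec():
    args = {"ts": ["1459642800000"]}
    ts = int(args["ts"][0]) if "ts" in args else None
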
@@ -376,8 +380,10 @@ def __init__(self, hs): connectTimeout=15, contextFactory=hs.get_http_client_context_factory() )), [('gzip', GzipDecoder)]) - # Look like Chrome for now - #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) + # We could look like Chrome: + # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) + # Chrome Safari" % hs.version_string) + def encode_urlencode_args(args): return {k: encode_urlencode_arg(v) for k, v in args.items()} diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 8f3491b91c66..11f672aeabc5 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -80,4 +80,3 @@ def __init__(self, hs): self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("identicon", IdenticonResource()) self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) - diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7c69c01a6c40..29db5c7fce0f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -13,25 +13,31 @@ # limitations under the License. from .base_resource import BaseMediaResource -from synapse.api.errors import Codes -from twisted.web.resource import Resource + from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html from urlparse import urlparse, urlunparse + +from synapse.api.errors import Codes from synapse.util.stringutils import random_string from synapse.util.caches.expiringcache import ExpiringCache from synapse.http.client import SpiderHttpClient -from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes +from synapse.http.server import ( + request_handler, respond_with_json, respond_with_json_bytes +) from synapse.util.async import ObservableDeferred +from synapse.util.stringutils import is_ascii import os import re +import cgi import ujson as json import logging logger = logging.getLogger(__name__) + class PreviewUrlResource(BaseMediaResource): isLeaf = True @@ -41,9 +47,10 @@ def __init__(self, hs, filepaths): # simple memory cache mapping urls to OG metadata self.cache = ExpiringCache( - cache_name = "url_previews", - clock = self.clock, - expiry_ms = 60*60*1000, # don't spider URLs more often than once an hour + cache_name="url_previews", + clock=self.clock, + # don't spider URLs more often than once an hour + expiry_ms=60 * 60 * 1000, ) self.cache.start() @@ -56,12 +63,15 @@ def render_GET(self, request): @request_handler @defer.inlineCallbacks def _async_render_GET(self, request): - + try: # XXX: if get_user_by_req fails, what should we do in an async render? requester = yield self.auth.get_user_by_req(request) url = request.args.get("url")[0] - ts = int(request.args.get("ts")[0]) if "ts" in request.args else self.clock.time_msec() + if "ts" in request.args: + ts = int(request.args.get("ts")[0]) + else: + ts = self.clock.time_msec() # first check the memory cache - good to handle all the clients on this # HS thundering away to preview the same URL at the same time. 
@@ -98,7 +108,7 @@ def _async_render_GET(self, request): @download.addBoth def callback(media_info): - del self.downloads[key] + del self.downloads[url] return media_info media_info = yield download.observe() @@ -111,13 +121,15 @@ def callback(media_info): if self._is_media(media_info['media_type']): dims = yield self._generate_local_thumbnails( - media_info['filesystem_id'], media_info - ) + media_info['filesystem_id'], media_info + ) og = { - "og:description" : media_info['download_name'], - "og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), - "og:image:type" : media_info['media_type'], + "og:description": media_info['download_name'], + "og:image": "mxc://%s/%s" % ( + self.server_name, media_info['filesystem_id'] + ), + "og:image:type": media_info['media_type'], } if dims: @@ -138,14 +150,14 @@ def callback(media_info): file = open(media_info['filename']) body = file.read() file.close() - tree = html.fromstring(body.decode('utf-8','ignore')) + tree = html.fromstring(body.decode('utf-8', 'ignore')) og = yield self._calc_og(tree, media_info, requester) else: logger.warn("Failed to find any OG data in %s", url) og = {} - logger.debug("Calculated OG for %s as %s" % (url, og)); + logger.debug("Calculated OG for %s as %s" % (url, og)) # store OG in ephemeral in-memory cache self.cache[url] = og @@ -181,28 +193,20 @@ def _calc_og(self, tree, media_info, requester): # suck our tree into lxml and define our OG response. # if we see any image URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) - - # "og:type" : "article" - # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" - # "og:title" : "Matrix on Twitter" - # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" - # "og:description" : "Synapse 0.12 is out! 
Lots of polishing, performance &amp; bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP" - # "og:site_name" : "Twitter" - - # or: + # (although the client could choose to do this by asking for previews of those + # URLs to avoid DoSing the server) # "og:type" : "video", # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", # "og:site_name" : "YouTube", # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : " ", + # "og:description" : "Fun stuff happening here", # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", # "og:video:width" : "1280" # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", + # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", og = {} for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): @@ -210,64 +214,76 @@ def _calc_og(self, tree, media_info, requester): # TODO: grab article: meta tags too, e.g.: - # - # - # - # - # - # + # "article:publisher" : "https://www.facebook.com/thethudonline" /> + # "article:author" content="https://www.facebook.com/thethudonline" /> + # "article:tag" content="baby" /> + # "article:section" content="Breaking News" /> + # "article:published_time" content="2016-03-31T19:58:24+00:00" /> + # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> if 'og:title' not in og: # do some basic spidering of the HTML title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") og['og:title'] = title[0].text.strip() if title else None - if 'og:image' not in og: # TODO: extract a favicon failing all else - meta_image = tree.xpath("//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"); + meta_image = tree.xpath( + "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content" + ) if meta_image: og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) else: # TODO: consider inlined CSS styles as well as width & height attribs images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") - images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) + images = sorted(images, key=lambda i: ( + -1 * int(i.attrib['width']) * int(i.attrib['height']) + )) if not images: images = tree.xpath("//img[@src]") if images: og['og:image'] = images[0].attrib['src'] # pre-cache the image for posterity - # FIXME: it might be cleaner to use the same flow as the main /preview_url request itself - # and benefit from the same caching etc. But for now we just rely on the caching - # of the master request to speed things up. + # FIXME: it might be cleaner to use the same flow as the main /preview_url request + # itself and benefit from the same caching etc. But for now we just rely on the + # caching on the master request to speed things up. 
if 'og:image' in og and og['og:image']: - image_info = yield self._download_url(self._rebase_url(og['og:image'], media_info['uri']), requester.user) + image_info = yield self._download_url( + self._rebase_url(og['og:image'], media_info['uri']), requester.user + ) if self._is_media(image_info['media_type']): # TODO: make sure we don't choke on white-on-transparent images dims = yield self._generate_local_thumbnails( - image_info['filesystem_id'], image_info - ) + image_info['filesystem_id'], image_info + ) if dims: og["og:image:width"] = dims['width'] og["og:image:height"] = dims['height'] else: logger.warn("Couldn't get dims for %s" % og["og:image"]) - og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) + og["og:image"] = "mxc://%s/%s" % ( + self.server_name, image_info['filesystem_id'] + ) og["og:image:type"] = image_info['media_type'] else: del og["og:image"] if 'og:description' not in og: - meta_description = tree.xpath("//*/meta[translate(@name, 'DESCRIPTION', 'description')='description']/@content"); + meta_description = tree.xpath( + "//*/meta" + "[translate(@name, 'DESCRIPTION', 'description')='description']" + "/@content") if meta_description: og['og:description'] = meta_description[0] else: - # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") - text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " + - "ancestor::footer | ancestor::script | ancestor::style)]" + + # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | " + # "//p/text() | //div/text() | //span/text() | //a/text()") + text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | " + "ancestor::aside | ancestor::footer | " + "ancestor::script | ancestor::style)]" + "[ancestor::body]") text = '' for text_node in text_nodes: @@ -280,15 +296,16 @@ def _calc_og(self, tree, media_info, requester): text = text.strip()[:500] og['og:description'] = text if text else None - # TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG - defer.returnValue(og); + # TODO: delete the url downloads to stop diskfilling, + # as we only ever cared about its OG + defer.returnValue(og) def _rebase_url(self, url, base): base = list(urlparse(base)) url = list(urlparse(url)) - if not url[0]: # fix up schema + if not url[0]: # fix up schema url[0] = base[0] or "http" - if not url[1]: # fix up hostname + if not url[1]: # fix up hostname url[1] = base[1] if not url[2].startswith('/'): url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2] @@ -377,6 +394,8 @@ def _is_media(self, content_type): def _is_html(self, content_type): content_type = content_type.lower() - if (content_type.startswith("text/html") or - content_type.startswith("application/xhtml")): + if ( + content_type.startswith("text/html") or + content_type.startswith("application/xhtml") + ): return True diff --git a/synapse/storage/media_repository.py b/synapse/storage/media_repository.py index bb002081ae50..c9dd20eed840 100644 --- a/synapse/storage/media_repository.py +++ b/synapse/storage/media_repository.py @@ -87,7 +87,8 @@ def get_url_cache_txn(txn): "get_url_cache", get_url_cache_txn ) - def store_url_cache(self, url, response_code, etag, expires, og, media_id, download_ts): + def store_url_cache(self, url, response_code, etag, expires, og, media_id, + download_ts): return self._simple_insert( "local_media_repository_url_cache", { From 0834b152fb05e110428a4834a2e5dc51b6f7d327 Mon Sep 17 
00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 12:59:27 +0100 Subject: [PATCH 28/40] char encoding --- synapse/rest/media/v1/preview_url_resource.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 29db5c7fce0f..ff522c5fb87a 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2016 OpenMarket Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); From cf51c4120e79a59a798fcf88c5c7d9f95dc6e76d Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 3 Apr 2016 23:57:05 +0100 Subject: [PATCH 29/40] report image size (bytewise) in OG meta --- synapse/rest/media/v1/preview_url_resource.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index ff522c5fb87a..f5ec32d8f2a4 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -131,6 +131,7 @@ def callback(media_info): self.server_name, media_info['filesystem_id'] ), "og:image:type": media_info['media_type'], + "matrix:image:size": media_info['media_length'], } if dims: @@ -269,6 +270,7 @@ def _calc_og(self, tree, media_info, requester): self.server_name, image_info['filesystem_id'] ) og["og:image:type"] = image_info['media_type'] + og["matrix:image:size"] = image_info['media_length'] else: del og["og:image"] From dafef5a688b8684232346a26a789a2da600ec58e Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Fri, 8 Apr 2016 18:37:15 +0100 Subject: [PATCH 30/40] Add url_preview_enabled config option to turn on/off preview_url endpoint. defaults to off. Add url_preview_ip_range_blacklist to let admins specify internal IP ranges that must not be spidered. Add url_preview_url_blacklist to let admins specify URL patterns that must not be spidered. 
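
The denied-IP check that url_preview_ip_range_blacklist feeds builds on netaddr, added below as a conditional dependency; SpiderEndpoint applies it after DNS resolution, so a public hostname pointing at a private address is still refused. The containment test at its core looks like this (the ranges and address are examples, not shipped defaults):

    from netaddr import IPAddress, IPSet
    from twisted.internet.error import ConnectError

    # the CIDR ranges an admin lists under url_preview_ip_range_blacklist
    blacklist = IPSet(["127.0.0.0/8", "10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"])

    address = "10.1.2.3"  # e.g. what reactor.resolve() returned for the target host
    if IPAddress(address) in blacklist:
        raise ConnectError("Refusing to spider blacklisted IP address %s" % address)
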
Implement a custom SpiderEndpoint and associated support classes to implement url_preview_ip_range_blacklist Add commentary and generally address PR feedback --- synapse/config/repository.py | 77 ++++++++++++++++++- synapse/http/client.py | 44 +++++++++-- synapse/http/endpoint.py | 35 ++++++++- synapse/python_dependencies.py | 7 +- synapse/rest/media/v1/media_repository.py | 7 +- synapse/rest/media/v1/preview_url_resource.py | 75 ++++++++++++++---- 6 files changed, 214 insertions(+), 31 deletions(-) diff --git a/synapse/config/repository.py b/synapse/config/repository.py index f4ab705701b0..da1007d76782 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -16,6 +16,8 @@ from ._base import Config from collections import namedtuple +import sys + ThumbnailRequirement = namedtuple( "ThumbnailRequirement", ["width", "height", "method", "media_type"] ) @@ -23,7 +25,7 @@ def parse_thumbnail_requirements(thumbnail_sizes): """ Takes a list of dictionaries with "width", "height", and "method" keys - and creates a map from image media types to the thumbnail size, thumnailing + and creates a map from image media types to the thumbnail size, thumbnailing method, and thumbnail media type to precalculate Args: @@ -60,6 +62,18 @@ def read_config(self, config): self.thumbnail_requirements = parse_thumbnail_requirements( config["thumbnail_sizes"] ) + self.url_preview_enabled = config["url_preview_enabled"] + if self.url_preview_enabled: + try: + from netaddr import IPSet + if "url_preview_ip_range_blacklist" in config: + self.url_preview_ip_range_blacklist = IPSet( + config["url_preview_ip_range_blacklist"] + ) + if "url_preview_url_blacklist" in config: + self.url_preview_url_blacklist = config["url_preview_url_blacklist"] + except ImportError: + sys.stderr.write("\nmissing netaddr dep - disabling preview_url API\n") def default_config(self, **kwargs): media_store = self.default_path("media_store") @@ -74,9 +88,6 @@ def default_config(self, **kwargs): # The largest allowed upload size in bytes max_upload_size: "10M" - # The largest allowed URL preview spidering size in bytes - max_spider_size: "10M" - # Maximum number of pixels that will be thumbnailed max_image_pixels: "32M" @@ -104,4 +115,62 @@ def default_config(self, **kwargs): - width: 800 height: 600 method: scale + + # Is the preview URL API enabled? If enabled, you *must* specify + # an explicit url_preview_ip_range_blacklist of IPs that the spider is + # denied from accessing. + url_preview_enabled: False + + # List of IP address CIDR ranges that the URL preview spider is denied + # from accessing. There are no defaults: you must explicitly + # specify a list for URL previewing to work. You should specify any + # internal services in your network that you do not want synapse to try + # to connect to, otherwise anyone in any Matrix room could cause your + # synapse to issue arbitrary GET requests to your internal services, + # causing serious security issues. + # + # url_preview_ip_range_blacklist: + # - '127.0.0.0/8' + # - '10.0.0.0/8' + # - '172.16.0.0/12' + # - '192.168.0.0/16' + + # Optional list of URL matches that the URL preview spider is + # denied from accessing. You should use url_preview_ip_range_blacklist + # in preference to this, otherwise someone could define a public DNS + # entry that points to a private IP address and circumvent the blacklist. + # This is more useful if you know there is an entire shape of URL that + # you know that will never want synapse to try to spider. 
+ # + # Each list entry is a dictionary of url component attributes as returned + # by urlparse.urlsplit as applied to the absolute form of the URL. See + # https://docs.python.org/2/library/urlparse.html#urlparse.urlsplit + # The values of the dictionary are treated as an filename match pattern + # applied to that component of URLs, unless they start with a ^ in which + # case they are treated as a regular expression match. If all the + # specified component matches for a given list item succeed, the URL is + # blacklisted. + # + # url_preview_url_blacklist: + # # blacklist any URL with a username in its URI + # - username: '*'' + # + # # blacklist all *.google.com URLs + # - netloc: 'google.com' + # - netloc: '*.google.com' + # + # # blacklist all plain HTTP URLs + # - scheme: 'http' + # + # # blacklist http(s)://www.acme.com/foo + # - netloc: 'www.acme.com' + # path: '/foo' + # + # # blacklist any URL with a literal IPv4 address + # - netloc: '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$' + + # The largest allowed URL preview spidering size in bytes + max_spider_size: "10M" + + """ % locals() diff --git a/synapse/http/client.py b/synapse/http/client.py index 442b4bb73ddf..3b8ffcd3ef8c 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -20,10 +20,12 @@ ) from synapse.util.logcontext import preserve_context_over_fn import synapse.metrics +from synapse.http.endpoint import SpiderEndpoint from canonicaljson import encode_canonical_json from twisted.internet import defer, reactor, ssl, protocol +from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint from twisted.web.client import ( BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent, readBody, FileBodyProducer, PartialDownloadError, @@ -364,6 +366,35 @@ def post_urlencoded_get_raw(self, url, args={}): defer.returnValue(e.response) +class SpiderEndpointFactory(object): + def __init__(self, hs): + self.blacklist = hs.config.url_preview_ip_range_blacklist + self.policyForHTTPS = hs.get_http_client_context_factory() + + def endpointForURI(self, uri): + logger.info("Getting endpoint for %s", uri.toBytes()) + if uri.scheme == "http": + return SpiderEndpoint( + reactor, uri.host, uri.port, self.blacklist, + endpoint=TCP4ClientEndpoint, + endpoint_kw_args={ + 'timeout': 15 + }, + ) + elif uri.scheme == "https": + tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port) + return SpiderEndpoint( + reactor, uri.host, uri.port, self.blacklist, + endpoint=SSL4ClientEndpoint, + endpoint_kw_args={ + 'sslContextFactory': tlsPolicy, + 'timeout': 15 + }, + ) + else: + logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme) + + class SpiderHttpClient(SimpleHttpClient): """ Separate HTTP client for spidering arbitrary URLs. 
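
The comments in the sample config above spell out the url_preview_url_blacklist matching rules, but the hunks shown here stop short of the matcher itself. A sketch that follows those documented semantics, with a hypothetical function name and no claim about where the real check is wired in:

    import fnmatch
    import re
    from urlparse import urlsplit  # Python 2, as elsewhere in this codebase

    def url_is_blacklisted(url, url_blacklist):
        # True if some blacklist entry matches every component it names.
        parts = urlsplit(url)
        for entry in url_blacklist:
            for attrib, pattern in entry.items():
                value = str(getattr(parts, attrib) or "")
                if pattern.startswith("^"):
                    # a leading ^ makes the pattern a regular expression...
                    if not re.match(pattern, value):
                        break
                elif not fnmatch.fnmatch(value, pattern):
                    # ...otherwise it is a filename-style match pattern
                    break
            else:
                return True  # every named component matched: blacklisted
        return False

    # e.g. the "http(s)://www.acme.com/foo" example from the config:
    # url_is_blacklisted("https://www.acme.com/foo",
    #                    [{"netloc": "www.acme.com", "path": "/foo"}])  # => True
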
diff --git a/synapse/http/endpoint.py b/synapse/http/endpoint.py
index 4775f6707d3e..de5c762f500d 100644
--- a/synapse/http/endpoint.py
+++ b/synapse/http/endpoint.py
@@ -74,6 +74,37 @@ def matrix_federation_endpoint(reactor, destination, ssl_context_factory=None,
     return transport_endpoint(reactor, domain, port, **endpoint_kw_args)
 
 
+class SpiderEndpoint(object):
+    """An endpoint which refuses to connect to blacklisted IP addresses.
+    Implements twisted.internet.interfaces.IStreamClientEndpoint.
+    """
+    def __init__(self, reactor, host, port, blacklist,
+                 endpoint=TCP4ClientEndpoint, endpoint_kw_args={}):
+        self.reactor = reactor
+        self.host = host
+        self.port = port
+        self.blacklist = blacklist
+        self.endpoint = endpoint
+        self.endpoint_kw_args = endpoint_kw_args
+
+    @defer.inlineCallbacks
+    def connect(self, protocolFactory):
+        address = yield self.reactor.resolve(self.host)
+
+        from netaddr import IPAddress
+        if IPAddress(address) in self.blacklist:
+            raise ConnectError(
+                "Refusing to spider blacklisted IP address %s" % address
+            )
+
+        logger.info("Connecting to %s:%s", address, self.port)
+        endpoint = self.endpoint(
+            self.reactor, address, self.port, **self.endpoint_kw_args
+        )
+        connection = yield endpoint.connect(protocolFactory)
+        defer.returnValue(connection)
+
+
 class SRVClientEndpoint(object):
     """An endpoint which looks up SRV records for a service.
     Cycles through the list of servers starting with each call to connect.
@@ -118,7 +149,7 @@ def pick_server(self):
             return self.default_server
         else:
             raise ConnectError(
-                "Not server available for %s", self.service_name
+                "No server available for %s" % self.service_name
             )
 
         min_priority = self.servers[0].priority
@@ -166,7 +197,7 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE):
             and answers[0].type == dns.SRV
             and answers[0].payload
             and answers[0].payload.target == dns.Name('.')):
-        raise ConnectError("Service %s unavailable", service_name)
+        raise ConnectError("Service %s unavailable" % service_name)
 
     for answer in answers:
         if answer.type != dns.SRV or not answer.payload:
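SpiderEndpoint resolves the hostname first, checks the resolved address
against the netaddr IPSet, and then connects to that same literal address;
because the hostname is never re-resolved between the check and the connect,
a DNS entry cannot be swapped to point at a private IP in the gap. The check
itself, in isolation (socket.gethostbyname stands in for the deferred
reactor.resolve(), and the blacklist ranges are the examples from the config):

    import socket
    from netaddr import IPAddress, IPSet

    blacklist = IPSet([
        '127.0.0.0/8', '10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16',
    ])

    def resolve_and_check(host):
        # stand-in for SpiderEndpoint.connect()'s resolve-then-check step
        address = socket.gethostbyname(host)
        if IPAddress(address) in blacklist:
            raise ValueError(
                "Refusing to spider blacklisted IP address %s" % address
            )
        return address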
diff --git a/synapse/python_dependencies.py b/synapse/python_dependencies.py
index 86b833176097..1adbdd942125 100644
--- a/synapse/python_dependencies.py
+++ b/synapse/python_dependencies.py
@@ -36,13 +36,16 @@
     "blist": ["blist"],
     "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"],
     "pymacaroons-pynacl": ["pymacaroons"],
-    "lxml>=3.6.0": ["lxml"],
     "pyjwt": ["jwt"],
 }
 CONDITIONAL_REQUIREMENTS = {
     "web_client": {
         "matrix_angular_sdk>=0.6.8": ["syweb>=0.6.8"],
-    }
+    },
+    "preview_url": {
+        "lxml>=3.6.0": ["lxml"],
+        "netaddr>=0.7.18": ["netaddr"],
+    },
 }
 
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index 11f672aeabc5..97b7e84af932 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -79,4 +79,9 @@ def __init__(self, hs):
         self.putChild("download", DownloadResource(hs, filepaths))
         self.putChild("thumbnail", ThumbnailResource(hs, filepaths))
         self.putChild("identicon", IdenticonResource())
-        self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
+        if hs.config.url_preview_enabled:
+            try:
+                self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
+            except Exception as e:
+                logger.warn("Failed to mount preview_url")
+                logger.exception(e)
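Moving lxml out of the hard REQUIREMENTS and into a new "preview_url" section
of CONDITIONAL_REQUIREMENTS (alongside netaddr) makes both libraries optional
at install time, which is why their imports have to be guarded at runtime.
The guard pattern used by preview_url_resource.py below, sketched standalone
(require_lxml is a hypothetical helper, not part of the patch):

    # try the optional import once at module load; record the failure
    # rather than letting an ImportError kill the whole homeserver
    try:
        from lxml import html
    except ImportError:
        html = None

    def require_lxml():
        # hypothetical guard: features that need lxml call this first
        if html is None:
            raise RuntimeError("lxml is required for URL previews")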
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index f5ec32d8f2a4..faa88deb6e17 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -17,34 +17,52 @@
 from twisted.web.server import NOT_DONE_YET
 from twisted.internet import defer
 
-from lxml import html
-from urlparse import urlparse, urlunparse
+from urlparse import urlparse, urlsplit, urlunparse
 
-from synapse.api.errors import Codes
 from synapse.util.stringutils import random_string
 from synapse.util.caches.expiringcache import ExpiringCache
 from synapse.http.client import SpiderHttpClient
 from synapse.http.server import (
-    request_handler, respond_with_json, respond_with_json_bytes
+    request_handler, respond_with_json_bytes
 )
 from synapse.util.async import ObservableDeferred
 from synapse.util.stringutils import is_ascii
 
 import os
 import re
+import fnmatch
 import cgi
 import ujson as json
 import logging
 
 logger = logging.getLogger(__name__)
 
+try:
+    from lxml import html
+except ImportError:
+    html = None  # checked in the constructor below
+
 
 class PreviewUrlResource(BaseMediaResource):
     isLeaf = True
 
     def __init__(self, hs, filepaths):
+        if not html:
+            logger.warn("Disabling PreviewUrlResource as lxml not available")
+            raise RuntimeError("lxml not available")
+
+        if not hasattr(hs.config, "url_preview_ip_range_blacklist"):
+            logger.warn(
+                "For security, you must specify an explicit target IP address "
+                "blacklist in url_preview_ip_range_blacklist for url previewing "
+                "to work"
+            )
+            raise RuntimeError("url_preview_ip_range_blacklist not specified")
+
         BaseMediaResource.__init__(self, hs, filepaths)
         self.client = SpiderHttpClient(hs)
+        if hasattr(hs.config, "url_preview_url_blacklist"):
+            self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
 
         # simple memory cache mapping urls to OG metadata
         self.cache = ExpiringCache(
@@ -74,6 +92,36 @@ def _async_render_GET(self, request):
         else:
             ts = self.clock.time_msec()
 
+        # impose the URL pattern blacklist
+        if hasattr(self, "url_preview_url_blacklist"):
+            url_tuple = urlsplit(url)
+            for entry in self.url_preview_url_blacklist:
+                match = True
+                for attrib in entry:
+                    pattern = entry[attrib]
+                    value = getattr(url_tuple, attrib)
+                    logger.debug("Matching attrib '%s' with value '%s' against pattern '%s'" % (
+                        attrib, value, pattern
+                    ))
+
+                    if value is None:
+                        match = False
+                        continue
+
+                    if pattern.startswith('^'):
+                        if not re.match(pattern, value):
+                            match = False
+                            continue
+                    else:
+                        if not fnmatch.fnmatch(value, pattern):
+                            match = False
+                            continue
+                if match:
+                    logger.warn(
+                        "URL %s blocked by url_blacklist entry %s", url, entry
+                    )
+                    raise RuntimeError(
+                        "URL blocked by url pattern blacklist entry"
+                    )
+
         # first check the memory cache - good to handle all the clients on this
         # HS thundering away to preview the same URL at the same time.
         try:
@@ -177,17 +225,6 @@ def callback(media_info):
 
             respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
         except:
-            # XXX: if we don't explicitly respond here, the request never returns.
-            # isn't this what server.py's wrapper is meant to be doing for us?
-            respond_with_json(
-                request,
-                500,
-                {
-                    "error": "Internal server error",
-                    "errcode": Codes.UNKNOWN,
-                },
-                send_cors=True
-            )
             raise
 
     @defer.inlineCallbacks
@@ -282,8 +319,12 @@ def _calc_og(self, tree, media_info, requester):
         if meta_description:
             og['og:description'] = meta_description[0]
         else:
-            # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | "
-            #                         "//p/text() | //div/text() | //span/text() | //a/text()")
+            # grab any text nodes which are inside the <body/> tag...
+            # unless they are within an HTML5 semantic markup tag...
+            # <header/>, <nav/>, <aside/>, <footer/>