From 47efba359ef8c53be33837b3760c64e46e40530d Mon Sep 17 00:00:00 2001 From: Mathieu Velten Date: Mon, 24 Jun 2024 22:57:44 +0200 Subject: [PATCH] Remove check of matching claimed and detected MIME type It is not necessary since we already check allowed MIME types against the detected type and not the claimed one. It leads to quite a bunch of false positives since a lot of clients have trouble properly detecting the type. --- src/matrix_content_scanner/scanner/scanner.py | 39 +++---------------- tests/scanner/test_scanner.py | 13 ------- 2 files changed, 5 insertions(+), 47 deletions(-) diff --git a/src/matrix_content_scanner/scanner/scanner.py b/src/matrix_content_scanner/scanner/scanner.py index e9f1cf0..f24c245 100644 --- a/src/matrix_content_scanner/scanner/scanner.py +++ b/src/matrix_content_scanner/scanner/scanner.py @@ -332,13 +332,8 @@ async def _scan_media( # If the file is encrypted, we need to decrypt it before we can scan it. media_content = self._decrypt_file(media_content, metadata) - # Check the file's MIME type to see if it's allowed and, if the file is not - # encrypted, if it matches the Content-Type header the homeserver sent us. - self._check_mimetype( - media_content=media_content, - claimed_mimetype=media.content_type, - encrypted=metadata is not None, - ) + # Check the file's MIME type to see if it's allowed. + self._check_mimetype(media_content) # Write the file to disk. file_path = self._write_file_to_disk(media_path, media_content) @@ -498,42 +493,18 @@ async def _run_scan(self, file_name: str) -> int: return retcode - def _check_mimetype( - self, - media_content: bytes, - claimed_mimetype: str, - encrypted: bool, - ) -> None: - """Detects the MIME type of the provided bytes, and checks that: - * it matches with the Content-Type header that was received when downloading this - file (if the media isn't encrypted, since otherwise the Content-Type header - is always 'application/octet-stream') - * files with this MIME type are allowed (if an allow list is provided in the - configuration) + def _check_mimetype(self,media_content: bytes) -> None: + """Detects the MIME type of the provided bytes, and checks that this type is allowed + (if an allow list is provided in the configuration) Args: media_content: The file's content. If the file is encrypted, this is its decrypted content. - claimed_mimetype: The value of the Content-Type header received when - downloading the file. - encrypted: Whether the file was encrypted (in which case we don't want to - check that its MIME type matches with the Content-Type header). Raises: FileDirtyError if one of the checks fail. """ detected_mimetype = magic.mimetype(media_content) logger.debug("Detected MIME type for file is %s", detected_mimetype) - # Check if the MIME type is matching the one that's expected, but only if the file - # is not encrypted (because otherwise we'll always have 'application/octet-stream' - # in the Content-Type header regardless of the actual MIME type of the file). - if encrypted is False and detected_mimetype != claimed_mimetype: - logger.error( - "Mismatching MIME type (%s) and Content-Type header (%s)", - detected_mimetype, - claimed_mimetype, - ) - raise FileDirtyError("File type not supported") - # If there's an allow list for MIME types, check that the MIME type that's been # detected for this file is in it. if ( diff --git a/tests/scanner/test_scanner.py b/tests/scanner/test_scanner.py index 4a9c7a5..1aabd15 100644 --- a/tests/scanner/test_scanner.py +++ b/tests/scanner/test_scanner.py @@ -247,19 +247,6 @@ async def test_mimetype_encrypted(self) -> None: with self.assertRaises(FileDirtyError): await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) - async def test_mimetype_content_type_mismatch(self) -> None: - """Tests that a scan fails if the detected MIME type does not match the value of - the Content-Type header sent by the homeserver. - """ - # Set up the file description to make it look as if the homeserver tried to tell - # us the file is a JPEG (even though it's actually a PNG). - self.downloader_res.content_type = "image/jpeg" - - # Check that the scan fails since the file's detected MIME type doesn't match the - # value of the Content-Type header. - with self.assertRaises(FileDirtyError): - await self.scanner.scan_file(MEDIA_PATH) - async def test_dont_cache_exit_codes(self) -> None: """Tests that if the configuration specifies exit codes to ignore when running the scanning script, we don't cache them.