diff --git a/src/matrix_content_scanner/scanner/scanner.py b/src/matrix_content_scanner/scanner/scanner.py index e9f1cf0..f24c245 100644 --- a/src/matrix_content_scanner/scanner/scanner.py +++ b/src/matrix_content_scanner/scanner/scanner.py @@ -332,13 +332,8 @@ async def _scan_media( # If the file is encrypted, we need to decrypt it before we can scan it. media_content = self._decrypt_file(media_content, metadata) - # Check the file's MIME type to see if it's allowed and, if the file is not - # encrypted, if it matches the Content-Type header the homeserver sent us. - self._check_mimetype( - media_content=media_content, - claimed_mimetype=media.content_type, - encrypted=metadata is not None, - ) + # Check the file's MIME type to see if it's allowed. + self._check_mimetype(media_content) # Write the file to disk. file_path = self._write_file_to_disk(media_path, media_content) @@ -498,42 +493,18 @@ async def _run_scan(self, file_name: str) -> int: return retcode - def _check_mimetype( - self, - media_content: bytes, - claimed_mimetype: str, - encrypted: bool, - ) -> None: - """Detects the MIME type of the provided bytes, and checks that: - * it matches with the Content-Type header that was received when downloading this - file (if the media isn't encrypted, since otherwise the Content-Type header - is always 'application/octet-stream') - * files with this MIME type are allowed (if an allow list is provided in the - configuration) + def _check_mimetype(self,media_content: bytes) -> None: + """Detects the MIME type of the provided bytes, and checks that this type is allowed + (if an allow list is provided in the configuration) Args: media_content: The file's content. If the file is encrypted, this is its decrypted content. - claimed_mimetype: The value of the Content-Type header received when - downloading the file. - encrypted: Whether the file was encrypted (in which case we don't want to - check that its MIME type matches with the Content-Type header). Raises: FileDirtyError if one of the checks fail. """ detected_mimetype = magic.mimetype(media_content) logger.debug("Detected MIME type for file is %s", detected_mimetype) - # Check if the MIME type is matching the one that's expected, but only if the file - # is not encrypted (because otherwise we'll always have 'application/octet-stream' - # in the Content-Type header regardless of the actual MIME type of the file). - if encrypted is False and detected_mimetype != claimed_mimetype: - logger.error( - "Mismatching MIME type (%s) and Content-Type header (%s)", - detected_mimetype, - claimed_mimetype, - ) - raise FileDirtyError("File type not supported") - # If there's an allow list for MIME types, check that the MIME type that's been # detected for this file is in it. if ( diff --git a/tests/scanner/test_scanner.py b/tests/scanner/test_scanner.py index 4a9c7a5..1aabd15 100644 --- a/tests/scanner/test_scanner.py +++ b/tests/scanner/test_scanner.py @@ -247,19 +247,6 @@ async def test_mimetype_encrypted(self) -> None: with self.assertRaises(FileDirtyError): await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) - async def test_mimetype_content_type_mismatch(self) -> None: - """Tests that a scan fails if the detected MIME type does not match the value of - the Content-Type header sent by the homeserver. - """ - # Set up the file description to make it look as if the homeserver tried to tell - # us the file is a JPEG (even though it's actually a PNG). - self.downloader_res.content_type = "image/jpeg" - - # Check that the scan fails since the file's detected MIME type doesn't match the - # value of the Content-Type header. - with self.assertRaises(FileDirtyError): - await self.scanner.scan_file(MEDIA_PATH) - async def test_dont_cache_exit_codes(self) -> None: """Tests that if the configuration specifies exit codes to ignore when running the scanning script, we don't cache them.