Skip to content

Commit

Permalink
🐛 [#4795] Invert validation for .msg files
Browse files Browse the repository at this point in the history
The SDK cannot reliably determine which content type belongs to a .msg
file, most notably on Linux and macOS, because the extension is not in
the MIME type database. This manifests as a file being uploaded with an
empty content-type.

To allow these files to go through, the serializer must allow empty
values for the 'type' field, which contains the detected content type,
and the backend must perform additional processing to determine the file
type. We can do this by falling back to the generic 'binary file'
(application/octet-stream) content type and letting libmagic figure
out which extensions belong to the magic bytes, i.e. we look at the
magic bytes to figure out what kind of file was provided, and we check
the provided file extension against the list of valid extensions for
the detected file type.
  • Loading branch information
robinmolen authored and sergei-maertens committed Dec 30, 2024
1 parent 32b30d4 commit 5cf6ed0
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 22 deletions.
45 changes: 31 additions & 14 deletions src/openforms/formio/api/validators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from pathlib import Path
from typing import Iterable

from django.core.files.uploadedfile import UploadedFile
Expand Down Expand Up @@ -56,29 +57,40 @@ def __init__(self, allowed_mime_types: Iterable[str] | None = None):

def __call__(self, value: UploadedFile) -> None:
head = value.read(2048)
ext = value.name.split(".")[-1]
mime_type = magic.from_buffer(head, mime=True)
ext = Path(value.name or "").suffix[1:]
detected_mime_type = magic.from_buffer(head, mime=True)
provided_mime_type = value.content_type or "application/octet-stream"

# gh #2520
# application/x-ole-storage on Arch with shared-mime-info 2.0+155+gf4e7cbc-1
if mime_type in ["application/CDFV2", "application/x-ole-storage"]:
if detected_mime_type in ["application/CDFV2", "application/x-ole-storage"]:
whole_file = head + value.read()
mime_type = magic.from_buffer(whole_file, mime=True)
detected_mime_type = magic.from_buffer(whole_file, mime=True)

if mime_type == "image/heif":
mime_type = "image/heic"
if detected_mime_type == "image/heif":
detected_mime_type = "image/heic"

if not (
self.any_allowed
or mimetype_allowed(mime_type, self._regular_mimes, self._wildcard_mimes)
or mimetype_allowed(
detected_mime_type, self._regular_mimes, self._wildcard_mimes
)
):
raise serializers.ValidationError(
_("The provided file is not a valid file type.")
)

if not ext:
raise serializers.ValidationError(
_(
"Could not determine the file type. Please make sure the file name "
"has an extension."
)
)

# Contents is allowed. Do extension or submitted content_type agree?
if value.content_type == "application/octet-stream":
m = magic.Magic(extension=True)
if provided_mime_type == "application/octet-stream":
m = magic.Magic(extension=True) # pyright: ignore[reportCallIssue]
extensions = m.from_buffer(head).split("/")
# magic db doesn't know any more specific extension(s), so accept the
# file
Expand All @@ -101,27 +113,32 @@ def __call__(self, value: UploadedFile) -> None:
# If the file does not strictly follow the conventions of CSV (e.g. non-standard delimiters),
# may not be considered as a valid CSV.
elif (
value.content_type == "text/csv"
and mime_type == "text/plain"
provided_mime_type == "text/csv"
and detected_mime_type == "text/plain"
and ext == "csv"
):
return
elif mime_type == "image/heic" and value.content_type in (
elif detected_mime_type == "image/heic" and provided_mime_type in (
"image/heic",
"image/heif",
):
return
# 4795
# The SDK cannot determine the file type of .msg files, which results in an
# empty content_type (""), so we have to validate these ourselves.
elif detected_mime_type == "application/vnd.ms-outlook" and ext == "msg":
return

# gh #4658
# Windows uses application/x-zip-compressed as the mimetype for .zip files, which
# is deprecated, but we still need to support it. The common case for
# zip files is the application/zip or application/zip-compressed mimetype.
elif mime_type == "application/zip" and value.content_type in (
elif detected_mime_type == "application/zip" and provided_mime_type in (
"application/zip-compressed",
"application/x-zip-compressed",
):
return
elif mime_type != value.content_type:
elif provided_mime_type != detected_mime_type:
raise serializers.ValidationError(
_("The provided file is not a {file_type}.").format(
filename=value.name, file_type=f".{ext}"
Expand Down
9 changes: 1 addition & 8 deletions src/openforms/formio/components/vanilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,14 +338,7 @@ class FileSerializer(serializers.Serializer):
originalName = serializers.CharField(trim_whitespace=False)
size = serializers.IntegerField(min_value=0)
storage = serializers.ChoiceField(choices=["url"])
type = serializers.CharField(
error_messages={
"blank": _(
"Could not determine the file type. Please make sure the file name "
"has an extension."
),
}
)
type = serializers.CharField(required=True, allow_blank=True)
url = serializers.URLField()
data = FileDataSerializer() # type: ignore

Expand Down

0 comments on commit 5cf6ed0

Please sign in to comment.