Don't load images / documents into memory when calculating their hash
This can be done in chunks, which prevents crashes when uploading large files.
RealOrangeOne authored and gasman committed Apr 3, 2023
1 parent dab2422 commit 4b38388
Showing 6 changed files with 124 additions and 13 deletions.
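
In outline, the change replaces whole-file reads with chunked reads when hashing. A minimal sketch of the idea (function names here are illustrative; the real helper added by this commit is `hash_filelike` in `wagtail/utils/file.py`, shown further down):

import hashlib

# Before: the entire file is pulled into memory just to hash it.
def hash_all_at_once(filelike):
    return hashlib.sha1(filelike.read()).hexdigest()

# After: read fixed-size chunks, so peak memory stays bounded no matter
# how large the uploaded file is.
def hash_in_chunks(filelike, chunk_size=64 * 1024):
    hasher = hashlib.sha1()
    while chunk := filelike.read(chunk_size):
        hasher.update(chunk)
    return hasher.hexdigest()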
13 changes: 6 additions & 7 deletions wagtail/documents/models.py
@@ -1,4 +1,3 @@
-import hashlib
 import os.path
 import urllib
 from contextlib import contextmanager
@@ -15,6 +14,7 @@
 from wagtail.models import CollectionMember, ReferenceIndex
 from wagtail.search import index
 from wagtail.search.queryset import SearchableQuerySetMixin
+from wagtail.utils.file import hash_filelike


 class DocumentQuerySet(SearchableQuerySetMixin, models.QuerySet):
@@ -122,14 +122,13 @@ def get_file_size(self):

         return self.file_size

-    def _set_file_hash(self, file_contents):
-        self.file_hash = hashlib.sha1(file_contents).hexdigest()
+    def _set_file_hash(self):
+        with self.open_file() as f:
+            self.file_hash = hash_filelike(f)

     def get_file_hash(self):
         if self.file_hash == "":
-            with self.open_file() as f:
-                self._set_file_hash(f.read())
-
+            self._set_file_hash()
             self.save(update_fields=["file_hash"])

         return self.file_hash
@@ -141,7 +140,7 @@ def _set_document_file_metadata(self):
         self.file_size = self.file.size

         # Set new document file hash
-        self._set_file_hash(self.file.read())
+        self._set_file_hash()
         self.file.seek(0)

     def __str__(self):
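A brief usage sketch (assumed, not part of the commit) of how the reworked document methods behave; `Document` here is Wagtail's built-in document model:

from wagtail.documents.models import Document

doc = Document.objects.first()

# First call with an empty file_hash: the file is opened and hashed in
# 64 KiB chunks via hash_filelike(), and the result is saved to file_hash.
print(doc.get_file_hash())

# Subsequent calls return the stored value without re-reading the file.
print(doc.get_file_hash())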
13 changes: 13 additions & 0 deletions wagtail/documents/tests/test_models.py
@@ -126,6 +126,19 @@ def test_content_type(self):
             "application/octet-stream", self.extensionless_document.content_type
         )

+    def test_file_hash(self):
+        self.assertEqual(
+            self.document.get_file_hash(), "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85"
+        )
+        self.assertEqual(
+            self.pdf_document.get_file_hash(),
+            "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85",
+        )
+        self.assertEqual(
+            self.extensionless_document.get_file_hash(),
+            "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85",
+        )
+
     def test_content_disposition(self):
         self.assertEqual(
             """attachment; filename=example.doc; filename*=UTF-8''example.doc""",
12 changes: 6 additions & 6 deletions wagtail/images/models.py
@@ -38,6 +38,7 @@
 from wagtail.models import CollectionMember, ReferenceIndex
 from wagtail.search import index
 from wagtail.search.queryset import SearchableQuerySetMixin
+from wagtail.utils.file import hash_filelike

 logger = logging.getLogger("wagtail.images")

@@ -225,14 +226,13 @@ class AbstractImage(ImageFileMixin, CollectionMember, index.Indexed, models.Mode

     objects = ImageQuerySet.as_manager()

-    def _set_file_hash(self, file_contents):
-        self.file_hash = hashlib.sha1(file_contents).hexdigest()
+    def _set_file_hash(self):
+        with self.open_file() as f:
+            self.file_hash = hash_filelike(f)

     def get_file_hash(self):
         if self.file_hash == "":
-            with self.open_file() as f:
-                self._set_file_hash(f.read())
-
+            self._set_file_hash()
             self.save(update_fields=["file_hash"])

         return self.file_hash
@@ -244,7 +244,7 @@ def _set_image_file_metadata(self):
         self.file_size = self.file.size

         # Set new image file hash
-        self._set_file_hash(self.file.read())
+        self._set_file_hash()
         self.file.seek(0)

     def get_upload_to(self, filename):
5 changes: 5 additions & 0 deletions wagtail/images/tests/test_models.py
@@ -112,6 +112,11 @@ def test_get_file_size_on_missing_file_raises_sourceimageioerror(self):
         with self.assertRaises(SourceImageIOError):
             self.image.get_file_size()

+    def test_file_hash(self):
+        self.assertEqual(
+            self.image.get_file_hash(), "4dd0211870e130b7e1690d2ec53c499a54a48fef"
+        )
+

 class TestImageQuerySet(TestCase):
     def test_search_method(self):
59 changes: 59 additions & 0 deletions wagtail/tests/test_utils.py
@@ -1,8 +1,11 @@
 # -*- coding: utf-8 -*
 import pickle
+from io import BytesIO, StringIO
+from pathlib import Path

 from django.contrib.contenttypes.models import ContentType
 from django.core.exceptions import ImproperlyConfigured, SuspiciousOperation
+from django.core.files.uploadedfile import SimpleUploadedFile
 from django.test import SimpleTestCase, TestCase, override_settings
 from django.utils.text import slugify
 from django.utils.translation import _trans
@@ -23,6 +26,7 @@
     string_to_ascii,
 )
 from wagtail.models import Page, Site
+from wagtail.utils.file import hash_filelike
 from wagtail.utils.utils import deep_update


@@ -508,3 +512,58 @@ def test_deep_update(self):
                 "starship": "enterprise",
             },
         )
+
+
+class HashFileLikeTestCase(SimpleTestCase):
+    test_file = Path.cwd() / "LICENSE"
+
+    def test_hashes_io(self):
+        self.assertEqual(
+            hash_filelike(BytesIO(b"test")), "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
+        )
+        self.assertEqual(
+            hash_filelike(StringIO("test")), "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
+        )
+
+    def test_hashes_file(self):
+        with self.test_file.open(mode="r") as f:
+            self.assertEqual(
+                hash_filelike(f), "9e58400061ca660ef7b5c94338a5205627c77eda"
+            )
+
+    def test_hashes_file_bytes(self):
+        with self.test_file.open(mode="rb") as f:
+            self.assertEqual(
+                hash_filelike(f), "9e58400061ca660ef7b5c94338a5205627c77eda"
+            )
+
+    def test_hashes_django_uploaded_file(self):
+        """
+        Check Django's file shims can be hashed as-is.
+        `SimpleUploadedFile` inherits the base `UploadedFile`, but is easiest to test against
+        """
+        self.assertEqual(
+            hash_filelike(SimpleUploadedFile("example.txt", b"test")),
+            "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3",
+        )
+
+    def test_hashes_large_file(self):
+        class FakeLargeFile:
+            """
+            A class that pretends to be a huge file (~1.3GB)
+            """
+
+            def __init__(self):
+                self.iterations = 20000
+
+            def read(self, bytes):
+                self.iterations -= 1
+                if not self.iterations:
+                    return b""
+
+                return b"A" * bytes
+
+        self.assertEqual(
+            hash_filelike(FakeLargeFile()),
+            "187cc1db32624dccace20d042f6d631f1a483020",
+        )
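
As a sanity check on the "~1.3GB" figure in the docstring above (my arithmetic, not part of the commit): the fake file serves about 20,000 reads of HASH_READ_SIZE (64 KiB) each, roughly 20,000 × 65,536 bytes ≈ 1.3 GB in total, while hash_filelike only ever holds a single 64 KiB chunk in memory.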
35 changes: 35 additions & 0 deletions wagtail/utils/file.py
@@ -0,0 +1,35 @@
+from hashlib import sha1
+from io import UnsupportedOperation
+
+from django.utils.encoding import force_bytes
+
+HASH_READ_SIZE = 65536  # 64k
+
+
+def hash_filelike(filelike):
+    """
+    Compute the hash of a file-like object, without loading it all into memory.
+    """
+    file_pos = 0
+    if hasattr(filelike, "tell"):
+        file_pos = filelike.tell()
+
+    try:
+        # Reset file handler to the start of the file so we hash it all
+        filelike.seek(0)
+    except (AttributeError, UnsupportedOperation):
+        pass
+
+    hasher = sha1()
+    while True:
+        data = filelike.read(HASH_READ_SIZE)
+        if not data:
+            break
+        # Use `force_bytes` to account for files opened as text
+        hasher.update(force_bytes(data))
+
+    if hasattr(filelike, "seek"):
+        # Reset the file handler to where it was before
+        filelike.seek(file_pos)
+
+    return hasher.hexdigest()
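
A quick usage sketch for the new helper (illustrative; the expected digests are the same ones asserted in the tests above):

from io import BytesIO

from wagtail.utils.file import hash_filelike

# In-memory byte buffers and text streams both work; text data is coerced
# to bytes with force_bytes() before hashing.
assert hash_filelike(BytesIO(b"test")) == "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"

# Files on disk are read 64 KiB at a time, so arbitrarily large files never
# need to fit into memory.
with open("LICENSE", "rb") as f:
    print(hash_filelike(f))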
