Don't load images / documents into memory when calculating their hash
This can be done in chunks, which prevents crashes when uploading large files.
RealOrangeOne authored and gasman committed Apr 3, 2023
1 parent dab2422 commit 4b38388
Showing 6 changed files with 124 additions and 13 deletions.
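
In outline, the change replaces whole-file reads with chunked reads when hashing. A minimal sketch of the idea (function names here are illustrative; the real helper added by this commit is `hash_filelike` in `wagtail/utils/file.py`, shown further down):

import hashlib

# Before: the entire file is pulled into memory just to hash it.
def hash_all_at_once(filelike):
    return hashlib.sha1(filelike.read()).hexdigest()

# After: read fixed-size chunks, so peak memory stays bounded no matter
# how large the uploaded file is.
def hash_in_chunks(filelike, chunk_size=64 * 1024):
    hasher = hashlib.sha1()
    while chunk := filelike.read(chunk_size):
        hasher.update(chunk)
    return hasher.hexdigest()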
13 changes: 6 additions & 7 deletions wagtail/documents/models.py
@@ -1,4 +1,3 @@
-import hashlib
 import os.path
 import urllib
 from contextlib import contextmanager
@@ -15,6 +14,7 @@
 from wagtail.models import CollectionMember, ReferenceIndex
 from wagtail.search import index
 from wagtail.search.queryset import SearchableQuerySetMixin
+from wagtail.utils.file import hash_filelike


 class DocumentQuerySet(SearchableQuerySetMixin, models.QuerySet):
@@ -122,14 +122,13 @@ def get_file_size(self):

         return self.file_size

-    def _set_file_hash(self, file_contents):
-        self.file_hash = hashlib.sha1(file_contents).hexdigest()
+    def _set_file_hash(self):
+        with self.open_file() as f:
+            self.file_hash = hash_filelike(f)

     def get_file_hash(self):
         if self.file_hash == "":
-            with self.open_file() as f:
-                self._set_file_hash(f.read())
-
+            self._set_file_hash()
             self.save(update_fields=["file_hash"])

         return self.file_hash
@@ -141,7 +140,7 @@ def _set_document_file_metadata(self):
         self.file_size = self.file.size

         # Set new document file hash
-        self._set_file_hash(self.file.read())
+        self._set_file_hash()
         self.file.seek(0)

     def __str__(self):
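A brief usage sketch (assumed, not part of the commit) of how the reworked document methods behave; `Document` here is Wagtail's built-in document model:

from wagtail.documents.models import Document

doc = Document.objects.first()

# First call with an empty file_hash: the file is opened and hashed in
# 64 KiB chunks via hash_filelike(), and the result is saved to file_hash.
print(doc.get_file_hash())

# Subsequent calls return the stored value without re-reading the file.
print(doc.get_file_hash())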
13 changes: 13 additions & 0 deletions wagtail/documents/tests/test_models.py
@@ -126,6 +126,19 @@ def test_content_type(self):
             "application/octet-stream", self.extensionless_document.content_type
         )

+    def test_file_hash(self):
+        self.assertEqual(
+            self.document.get_file_hash(), "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85"
+        )
+        self.assertEqual(
+            self.pdf_document.get_file_hash(),
+            "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85",
+        )
+        self.assertEqual(
+            self.extensionless_document.get_file_hash(),
+            "7d8c4778b182e4f3bd442408c64a6e22a4b0ed85",
+        )
+
     def test_content_disposition(self):
         self.assertEqual(
             """attachment; filename=example.doc; filename*=UTF-8''example.doc""",
12 changes: 6 additions & 6 deletions wagtail/images/models.py
@@ -38,6 +38,7 @@
 from wagtail.models import CollectionMember, ReferenceIndex
 from wagtail.search import index
 from wagtail.search.queryset import SearchableQuerySetMixin
+from wagtail.utils.file import hash_filelike

 logger = logging.getLogger("wagtail.images")

@@ -225,14 +226,13 @@ class AbstractImage(ImageFileMixin, CollectionMember, index.Indexed, models.Mode

     objects = ImageQuerySet.as_manager()

-    def _set_file_hash(self, file_contents):
-        self.file_hash = hashlib.sha1(file_contents).hexdigest()
+    def _set_file_hash(self):
+        with self.open_file() as f:
+            self.file_hash = hash_filelike(f)

     def get_file_hash(self):
         if self.file_hash == "":
-            with self.open_file() as f:
-                self._set_file_hash(f.read())
-
+            self._set_file_hash()
             self.save(update_fields=["file_hash"])

         return self.file_hash
@@ -244,7 +244,7 @@ def _set_image_file_metadata(self):
         self.file_size = self.file.size

         # Set new image file hash
-        self._set_file_hash(self.file.read())
+        self._set_file_hash()
         self.file.seek(0)

     def get_upload_to(self, filename):
5 changes: 5 additions & 0 deletions wagtail/images/tests/test_models.py
@@ -112,6 +112,11 @@ def test_get_file_size_on_missing_file_raises_sourceimageioerror(self):
         with self.assertRaises(SourceImageIOError):
             self.image.get_file_size()

+    def test_file_hash(self):
+        self.assertEqual(
+            self.image.get_file_hash(), "4dd0211870e130b7e1690d2ec53c499a54a48fef"
+        )
+

 class TestImageQuerySet(TestCase):
     def test_search_method(self):
59 changes: 59 additions & 0 deletions wagtail/tests/test_utils.py
@@ -1,8 +1,11 @@
 # -*- coding: utf-8 -*
 import pickle
+from io import BytesIO, StringIO
+from pathlib import Path

 from django.contrib.contenttypes.models import ContentType
 from django.core.exceptions import ImproperlyConfigured, SuspiciousOperation
+from django.core.files.uploadedfile import SimpleUploadedFile
 from django.test import SimpleTestCase, TestCase, override_settings
 from django.utils.text import slugify
 from django.utils.translation import _trans
@@ -23,6 +26,7 @@
     string_to_ascii,
 )
 from wagtail.models import Page, Site
+from wagtail.utils.file import hash_filelike
 from wagtail.utils.utils import deep_update


@@ -508,3 +512,58 @@ def test_deep_update(self):
                 "starship": "enterprise",
             },
         )
+
+
+class HashFileLikeTestCase(SimpleTestCase):
+    test_file = Path.cwd() / "LICENSE"
+
+    def test_hashes_io(self):
+        self.assertEqual(
+            hash_filelike(BytesIO(b"test")), "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
+        )
+        self.assertEqual(
+            hash_filelike(StringIO("test")), "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"
+        )
+
+    def test_hashes_file(self):
+        with self.test_file.open(mode="r") as f:
+            self.assertEqual(
+                hash_filelike(f), "9e58400061ca660ef7b5c94338a5205627c77eda"
+            )
+
+    def test_hashes_file_bytes(self):
+        with self.test_file.open(mode="rb") as f:
+            self.assertEqual(
+                hash_filelike(f), "9e58400061ca660ef7b5c94338a5205627c77eda"
+            )
+
+    def test_hashes_django_uploaded_file(self):
+        """
+        Check Django's file shims can be hashed as-is.
+        `SimpleUploadedFile` inherits the base `UploadedFile`, but is easiest to test against
+        """
+        self.assertEqual(
+            hash_filelike(SimpleUploadedFile("example.txt", b"test")),
+            "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3",
+        )
+
+    def test_hashes_large_file(self):
+        class FakeLargeFile:
+            """
+            A class that pretends to be a huge file (~1.3GB)
+            """
+
+            def __init__(self):
+                self.iterations = 20000
+
+            def read(self, bytes):
+                self.iterations -= 1
+                if not self.iterations:
+                    return b""
+
+                return b"A" * bytes
+
+        self.assertEqual(
+            hash_filelike(FakeLargeFile()),
+            "187cc1db32624dccace20d042f6d631f1a483020",
+        )
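
As a sanity check on the "~1.3GB" figure in the docstring above (my arithmetic, not part of the commit): the fake file serves about 20,000 reads of HASH_READ_SIZE (64 KiB) each, roughly 20,000 × 65,536 bytes ≈ 1.3 GB in total, while hash_filelike only ever holds a single 64 KiB chunk in memory.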
35 changes: 35 additions & 0 deletions wagtail/utils/file.py
@@ -0,0 +1,35 @@
+from hashlib import sha1
+from io import UnsupportedOperation
+
+from django.utils.encoding import force_bytes
+
+HASH_READ_SIZE = 65536  # 64k
+
+
+def hash_filelike(filelike):
+    """
+    Compute the hash of a file-like object, without loading it all into memory.
+    """
+    file_pos = 0
+    if hasattr(filelike, "tell"):
+        file_pos = filelike.tell()
+
+    try:
+        # Reset file handler to the start of the file so we hash it all
+        filelike.seek(0)
+    except (AttributeError, UnsupportedOperation):
+        pass
+
+    hasher = sha1()
+    while True:
+        data = filelike.read(HASH_READ_SIZE)
+        if not data:
+            break
+        # Use `force_bytes` to account for files opened as text
+        hasher.update(force_bytes(data))
+
+    if hasattr(filelike, "seek"):
+        # Reset the file handler to where it was before
+        filelike.seek(file_pos)
+
+    return hasher.hexdigest()
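
A quick usage sketch for the new helper (illustrative; the expected digests are the same ones asserted in the tests above):

from io import BytesIO

from wagtail.utils.file import hash_filelike

# In-memory byte buffers and text streams both work; text data is coerced
# to bytes with force_bytes() before hashing.
assert hash_filelike(BytesIO(b"test")) == "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"

# Files on disk are read 64 KiB at a time, so arbitrarily large files never
# need to fit into memory.
with open("LICENSE", "rb") as f:
    print(hash_filelike(f))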
