Commit
Automatically split large files when copying
dylanmccall committed Jun 28, 2022
1 parent a74e193 commit 05900fd
Showing 3 changed files with 158 additions and 22 deletions.
98 changes: 79 additions & 19 deletions ricecooker/classes/files.py
@@ -29,6 +29,7 @@
 from ..exceptions import UnknownFileTypeError
 from ricecooker.utils.encodings import get_base64_encoding
 from ricecooker.utils.encodings import write_base64_to_file
+from ricecooker.utils.file_slice import FileSlice
 from ricecooker.utils.images import create_image_from_epub
 from ricecooker.utils.images import create_image_from_pdf_page
 from ricecooker.utils.images import create_image_from_zip
@@ -161,12 +162,12 @@ def download(path, default_ext=None):
         # Get extension of file or use `default_ext` if none found
         if not ext:
             ext = extract_path_ext(path, default_ext=default_ext)
-        filename = copy_file_to_storage(tempf.name, ext=ext)
-        FILECACHE.set(key, bytes(filename, "utf-8"))
-        config.LOGGER.info("\t--- Downloaded {}".format(filename))
+        filenames = copy_file_to_storage(tempf.name, ext=ext)
+        FILECACHE.set(key, bytes(",".join(filenames), "utf-8"))
+        config.LOGGER.info("\t--- Downloaded {}".format(filenames))
         os.unlink(tempf.name)

-        return filename, ext
+        return filenames, ext


 def download_and_convert_video(path, ffmpeg_settings=None):
@@ -242,29 +243,48 @@ def write_path_to_filename(path, write_to_file):


 def get_hash(filepath):
-    file_hash = hashlib.md5()
     with open(filepath, "rb") as fobj:
-        for chunk in iter(lambda: fobj.read(2097152), b""):
-            file_hash.update(chunk)
+        return get_hash_from_fd(fobj)
+
+
+def get_hash_from_fd(fobj):
+    file_hash = hashlib.md5()
+    for chunk in iter(lambda: fobj.read(2097152), b""):
+        file_hash.update(chunk)
     return file_hash.hexdigest()


-def copy_file_to_storage(srcfilename, ext=None):
+# 10 MB in bytes
+FILE_SIZE_MAX_BYTES = 10000000
+
+
+def copy_file_to_storage(src_file_name, ext=None, chunk_size=FILE_SIZE_MAX_BYTES):
     """
-    Copy `srcfilename` (filepath) to destination.
+    Copy `src_file_name` (filepath) to destination.
+    The file will be broken into parts if its size exceeds `chunk_size`.
     :rtype: None
     """
     if ext is None:
-        ext = extract_path_ext(srcfilename)
+        ext = extract_path_ext(src_file_name)

-    hash = get_hash(srcfilename)
-    filename = "{}.{}".format(hash, ext)
-    try:
-        shutil.copy(srcfilename, config.get_storage_path(filename))
-    except shutil.SameFileError:
-        pass
+    filenames = []

-    return filename
+    with open(src_file_name, "rb") as src_fd:
+        slices = list(FileSlice.from_file(src_fd, chunk_size))
+
+        for slice in slices:
+            slice_hash = get_hash_from_fd(slice)
+            slice.seek(0)
+
+            file_name = "{}.{}".format(slice_hash, ext)
+            storage_path = config.get_storage_path(file_name)
+
+            with open(storage_path, "wb") as out_fd:
+                shutil.copyfileobj(slice, out_fd)
+
+            filenames.append(file_name)
+
+    return filenames


 def compress_video_file(filename, ffmpeg_settings):
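
For reference, a minimal usage sketch of the new chunking behaviour (not part of the commit; the source path, file size, and hash values below are hypothetical): a file larger than FILE_SIZE_MAX_BYTES now produces one storage file per chunk, each named after the MD5 hash of that chunk rather than of the whole file.

# Usage sketch; assumes a configured ricecooker environment and a hypothetical input path.
from ricecooker.classes.files import copy_file_to_storage

# A ~25 MB source file with the default 10 MB chunk_size yields three chunk files.
chunk_filenames = copy_file_to_storage("/tmp/big_video.mp4", ext="mp4")
print(chunk_filenames)  # e.g. ['0f8e...c1.mp4', '9a3b...77.mp4', '51d2...0e.mp4']
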
@@ -386,6 +406,9 @@ class File(object):
     language = None
     assessment_item = None
     is_primary = False
+    # Supplementary files are additional File objects which have been
+    # discovered that must be tracked in addition to this one.
+    supplementary_files = []

     def __init__(self, preset=None, language=None, default_ext=None, source_url=None):
         self.preset = preset
@@ -490,22 +513,59 @@ def validate(self):

     def process_file(self):
         try:
-            self.filename, self.ext = download(self.path, default_ext=self.default_ext)
+            filenames, self.ext = download(self.path, default_ext=self.default_ext)
             # don't validate for single-digit extension, or no extension
             if not self.ext:
                 self.ext = extract_path_ext(self.path)
-            return self.filename
         # Catch errors related to reading file path and handle silently
         except HTTP_CAUGHT_EXCEPTIONS as err:
             self.error = str(err)
             config.LOGGER.debug("Failed to download, error is: {}".format(err))
             config.FAILED_FILES.append(self)
             return None

+        supplementary_files = []
+
+        if isinstance(filenames, list):
+            self.filename = filenames[0]
+            for extra_filename in filenames[1:]:
+                extra_file = SplitFile(
+                    self,
+                    extra_filename,
+                    self.ext,
+                    preset=self.preset,
+                    language=self.language,
+                    default_ext=self.default_ext,
+                    source_url=self.source_url,
+                )
+                supplementary_files.append(extra_file)
+        else:
+            self.filename = filenames
+
+        self.supplementary_files = supplementary_files
+
+        return self.filename
+
     def __str__(self):
         return self.path


+class SplitFile(File):
+    # FIXME: Move this to the ZimNode / ZimFile, and adjust DownloadFile so it
+    # only creates split files if it is supported.
+    def __init__(self, base_file, filename, ext, **kwargs):
+        super(SplitFile, self).__init__(**kwargs)
+        self.base_file = base_file
+        self.filename = filename
+        self.ext = ext
+
+    def __str__(self):
+        return "{} split {}".format(self.base_file, self.filename)
+
+    def get_preset(self):
+        return self.base_file.get_preset()
+
+
 IMAGE_EXTENSIONS = {
     file_formats.PNG,
     file_formats.JPG,
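
Taken together, the files.py changes mean that process_file() keeps the first chunk as the file's own filename and wraps every additional chunk in a SplitFile that points back to its base file. A rough sketch of the resulting objects (not part of the commit; the URL and printed values are hypothetical, and it assumes DownloadFile, the subclass mentioned in the FIXME above, picks up this process_file()):

# Illustrative sketch only; assumes a configured ricecooker environment.
from ricecooker.classes.files import DownloadFile

f = DownloadFile("https://example.org/big.zim")
f.process_file()

print(f.filename)  # hash-named first chunk, e.g. '0f8e...c1.zim'
for extra in f.supplementary_files:
    # one SplitFile per remaining chunk, reusing the base file's preset
    print(extra.filename, extra.get_preset())
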
11 changes: 8 additions & 3 deletions ricecooker/classes/nodes.py
@@ -181,9 +181,14 @@ def process_files(self):
         - (optionally) generate thumbnail file from the node's content
         Returns: content-hash based filenames of all the files for this node
         """
-        filenames = []
+        extra_files = []
+
         for file in self.files:
-            filenames.append(file.process_file())
+            file.process_file()
+            extra_files.extend(file.supplementary_files)
+        self.files.extend(extra_files)
+
+        filenames = [file.filename for file in self.files]

         # Auto-generation of thumbnails happens here if derive_thumbnail or config.THUMBNAILS is set
         if not self.has_thumbnail() and (config.THUMBNAILS or self.derive_thumbnail):
@@ -198,7 +203,7 @@ def process_files(self):
             else:
                 pass  # method generate_thumbnail is not implemented or no suitable source file found

-        return filenames
+        return tuple(filenames)

     def count(self):
         """count: get number of nodes in tree
71 changes: 71 additions & 0 deletions ricecooker/utils/file_slice.py
@@ -0,0 +1,71 @@
+class FileSlice(object):
+    """
+    File-like object that represents a slice of a file, starting from its
+    current offset until `count`. Reads are always relative to the slice's
+    start and end point.
+    """
+
+    def __init__(self, file, count):
+        self.file = file
+        self.start = file.tell()
+
+        file.seek(0, 2)
+        self.file_size = file.tell()
+
+        count = min(self.file_size - self.start, count)
+        self.end = self.start + count
+
+        # Seek to the end of the file so the next FileSlice object will be
+        # created from that point.
+        file.seek(self.end)
+
+        self.__last_offset = self.start
+
+    @classmethod
+    def from_file(cls, file, chunk_size):
+        slice = cls(file, chunk_size)
+        yield slice
+
+        while slice.end < slice.file_size:
+            slice = cls(file, chunk_size)
+            yield slice
+
+    @property
+    def size(self):
+        return self.end - self.start
+
+    def seek(self, offset, whence=0):
+        if whence == 0:
+            offset = self.start + offset
+        elif whence == 1:
+            offset = self.tell() + offset
+        elif whence == 2:
+            offset = self.end - offset
+        self.file.seek(offset)
+        self.__store_offset()
+
+    def __reset_offset(self):
+        if self.file.tell() != self.__last_offset:
+            self.file.seek(self.__last_offset)
+
+    def __store_offset(self):
+        self.__last_offset = self.file.tell()
+
+    def tell(self):
+        self.__reset_offset()
+        return self.file.tell() - self.start
+
+    def read(self, count=None):
+        self.__reset_offset()
+
+        if count is None:
+            count = self.size
+
+        remaining = max(0, self.size - self.tell())
+
+        buffer = self.file.read(min(count, remaining))
+        self.__store_offset()
+        return buffer
+
+    def write(self, string):
+        raise NotImplementedError()
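
A standalone sketch of how FileSlice is meant to be consumed (not part of the commit; the input file name and chunk size are illustrative): from_file() is a generator that walks the underlying file in fixed-size windows, and each slice can be read and hashed independently of the others.

# Usage sketch for FileSlice (hypothetical input file).
import hashlib

from ricecooker.utils.file_slice import FileSlice

with open("big_file.bin", "rb") as fd:
    for slice in FileSlice.from_file(fd, 10000000):  # 10 MB windows
        data = slice.read()  # bytes of this window only
        print(slice.size, hashlib.md5(data).hexdigest())
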
