Commit
Automatically split large files when copying
dylanmccall committed Jun 28, 2022
1 parent a74e193 commit 05900fd
Showing 3 changed files with 158 additions and 22 deletions.
98 changes: 79 additions & 19 deletions ricecooker/classes/files.py
@@ -29,6 +29,7 @@
 from ..exceptions import UnknownFileTypeError
 from ricecooker.utils.encodings import get_base64_encoding
 from ricecooker.utils.encodings import write_base64_to_file
+from ricecooker.utils.file_slice import FileSlice
 from ricecooker.utils.images import create_image_from_epub
 from ricecooker.utils.images import create_image_from_pdf_page
 from ricecooker.utils.images import create_image_from_zip
@@ -161,12 +162,12 @@ def download(path, default_ext=None):
         # Get extension of file or use `default_ext` if none found
         if not ext:
             ext = extract_path_ext(path, default_ext=default_ext)
-        filename = copy_file_to_storage(tempf.name, ext=ext)
-        FILECACHE.set(key, bytes(filename, "utf-8"))
-        config.LOGGER.info("\t--- Downloaded {}".format(filename))
+        filenames = copy_file_to_storage(tempf.name, ext=ext)
+        FILECACHE.set(key, bytes(",".join(filenames), "utf-8"))
+        config.LOGGER.info("\t--- Downloaded {}".format(filenames))
         os.unlink(tempf.name)

-        return filename, ext
+        return filenames, ext


 def download_and_convert_video(path, ffmpeg_settings=None):
@@ -242,29 +243,48 @@ def write_path_to_filename(path, write_to_file):


 def get_hash(filepath):
-    file_hash = hashlib.md5()
     with open(filepath, "rb") as fobj:
-        for chunk in iter(lambda: fobj.read(2097152), b""):
-            file_hash.update(chunk)
+        return get_hash_from_fd(fobj)
+
+
+def get_hash_from_fd(fobj):
+    file_hash = hashlib.md5()
+    for chunk in iter(lambda: fobj.read(2097152), b""):
+        file_hash.update(chunk)
     return file_hash.hexdigest()


-def copy_file_to_storage(srcfilename, ext=None):
+# 10 MB in bytes
+FILE_SIZE_MAX_BYTES = 10000000
+
+
+def copy_file_to_storage(src_file_name, ext=None, chunk_size=FILE_SIZE_MAX_BYTES):
     """
-    Copy `srcfilename` (filepath) to destination.
+    Copy `src_file_name` (filepath) to destination.
+    The file will be broken into parts if its size exceeds `chunk_size`.
     :rtype: None
     """
     if ext is None:
-        ext = extract_path_ext(srcfilename)
+        ext = extract_path_ext(src_file_name)

-    hash = get_hash(srcfilename)
-    filename = "{}.{}".format(hash, ext)
-    try:
-        shutil.copy(srcfilename, config.get_storage_path(filename))
-    except shutil.SameFileError:
-        pass
+    filenames = []

-    return filename
+    with open(src_file_name, "rb") as src_fd:
+        slices = list(FileSlice.from_file(src_fd, chunk_size))
+
+        for slice in slices:
+            slice_hash = get_hash_from_fd(slice)
+            slice.seek(0)
+
+            file_name = "{}.{}".format(slice_hash, ext)
+            storage_path = config.get_storage_path(file_name)
+
+            with open(storage_path, "wb") as out_fd:
+                shutil.copyfileobj(slice, out_fd)
+
+            filenames.append(file_name)
+
+    return filenames


 def compress_video_file(filename, ffmpeg_settings):
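
For reference, a minimal usage sketch of the new chunking behaviour (not part of the commit; the source path, file size, and hash values below are hypothetical): a file larger than FILE_SIZE_MAX_BYTES now produces one storage file per chunk, each named after the MD5 hash of that chunk rather than of the whole file.

# Usage sketch; assumes a configured ricecooker environment and a hypothetical input path.
from ricecooker.classes.files import copy_file_to_storage

# A ~25 MB source file with the default 10 MB chunk_size yields three chunk files.
chunk_filenames = copy_file_to_storage("/tmp/big_video.mp4", ext="mp4")
print(chunk_filenames)  # e.g. ['0f8e...c1.mp4', '9a3b...77.mp4', '51d2...0e.mp4']
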
@@ -386,6 +406,9 @@ class File(object):
     language = None
     assessment_item = None
     is_primary = False
+    # Supplementary files are additional File objects which have been
+    # discovered that must be tracked in addition to this one.
+    supplementary_files = []

     def __init__(self, preset=None, language=None, default_ext=None, source_url=None):
         self.preset = preset
@@ -490,22 +513,59 @@ def validate(self):

     def process_file(self):
         try:
-            self.filename, self.ext = download(self.path, default_ext=self.default_ext)
+            filenames, self.ext = download(self.path, default_ext=self.default_ext)
             # don't validate for single-digit extension, or no extension
             if not self.ext:
                 self.ext = extract_path_ext(self.path)
-            return self.filename
         # Catch errors related to reading file path and handle silently
         except HTTP_CAUGHT_EXCEPTIONS as err:
             self.error = str(err)
             config.LOGGER.debug("Failed to download, error is: {}".format(err))
             config.FAILED_FILES.append(self)
             return None

+        supplementary_files = []
+
+        if isinstance(filenames, list):
+            self.filename = filenames[0]
+            for extra_filename in filenames[1:]:
+                extra_file = SplitFile(
+                    self,
+                    extra_filename,
+                    self.ext,
+                    preset=self.preset,
+                    language=self.language,
+                    default_ext=self.default_ext,
+                    source_url=self.source_url,
+                )
+                supplementary_files.append(extra_file)
+        else:
+            self.filename = filenames
+
+        self.supplementary_files = supplementary_files
+
+        return self.filename
+
     def __str__(self):
         return self.path


+class SplitFile(File):
+    # FIXME: Move this to the ZimNode / ZimFile, and adjust DownloadFile so it
+    # only creates split files if it is supported.
+    def __init__(self, base_file, filename, ext, **kwargs):
+        super(SplitFile, self).__init__(**kwargs)
+        self.base_file = base_file
+        self.filename = filename
+        self.ext = ext
+
+    def __str__(self):
+        return "{} split {}".format(self.base_file, self.filename)
+
+    def get_preset(self):
+        return self.base_file.get_preset()
+
+
 IMAGE_EXTENSIONS = {
     file_formats.PNG,
     file_formats.JPG,
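
Taken together, the files.py changes mean that process_file() keeps the first chunk as the file's own filename and wraps every additional chunk in a SplitFile that points back to its base file. A rough sketch of the resulting objects (not part of the commit; the URL and printed values are hypothetical, and it assumes DownloadFile, the subclass mentioned in the FIXME above, picks up this process_file()):

# Illustrative sketch only; assumes a configured ricecooker environment.
from ricecooker.classes.files import DownloadFile

f = DownloadFile("https://example.org/big.zim")
f.process_file()

print(f.filename)  # hash-named first chunk, e.g. '0f8e...c1.zim'
for extra in f.supplementary_files:
    # one SplitFile per remaining chunk, reusing the base file's preset
    print(extra.filename, extra.get_preset())
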
11 changes: 8 additions & 3 deletions ricecooker/classes/nodes.py
@@ -181,9 +181,14 @@ def process_files(self):
         - (optionally) generate thumbnail file from the node's content
         Returns: content-hash based filenames of all the files for this node
         """
-        filenames = []
+        extra_files = []
+
         for file in self.files:
-            filenames.append(file.process_file())
+            file.process_file()
+            extra_files.extend(file.supplementary_files)
+        self.files.extend(extra_files)
+
+        filenames = [file.filename for file in self.files]

         # Auto-generation of thumbnails happens here if derive_thumbnail or config.THUMBNAILS is set
         if not self.has_thumbnail() and (config.THUMBNAILS or self.derive_thumbnail):
@@ -198,7 +203,7 @@ def process_files(self):
             else:
                 pass  # method generate_thumbnail is not implemented or no suitable source file found

-        return filenames
+        return tuple(filenames)

     def count(self):
         """count: get number of nodes in tree
71 changes: 71 additions & 0 deletions ricecooker/utils/file_slice.py
@@ -0,0 +1,71 @@
+class FileSlice(object):
+    """
+    File-like object that represents a slice of a file, starting from its
+    current offset until `count`. Reads are always relative to the slice's
+    start and end point.
+    """
+
+    def __init__(self, file, count):
+        self.file = file
+        self.start = file.tell()
+
+        file.seek(0, 2)
+        self.file_size = file.tell()
+
+        count = min(self.file_size - self.start, count)
+        self.end = self.start + count
+
+        # Seek to the end of the file so the next FileSlice object will be
+        # created from that point.
+        file.seek(self.end)
+
+        self.__last_offset = self.start
+
+    @classmethod
+    def from_file(cls, file, chunk_size):
+        slice = cls(file, chunk_size)
+        yield slice
+
+        while slice.end < slice.file_size:
+            slice = cls(file, chunk_size)
+            yield slice
+
+    @property
+    def size(self):
+        return self.end - self.start
+
+    def seek(self, offset, whence=0):
+        if whence == 0:
+            offset = self.start + offset
+        elif whence == 1:
+            offset = self.tell() + offset
+        elif whence == 2:
+            offset = self.end - offset
+        self.file.seek(offset)
+        self.__store_offset()
+
+    def __reset_offset(self):
+        if self.file.tell() != self.__last_offset:
+            self.file.seek(self.__last_offset)
+
+    def __store_offset(self):
+        self.__last_offset = self.file.tell()
+
+    def tell(self):
+        self.__reset_offset()
+        return self.file.tell() - self.start
+
+    def read(self, count=None):
+        self.__reset_offset()
+
+        if count is None:
+            count = self.size
+
+        remaining = max(0, self.size - self.tell())
+
+        buffer = self.file.read(min(count, remaining))
+        self.__store_offset()
+        return buffer
+
+    def write(self, string):
+        raise NotImplementedError()
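
A standalone sketch of how FileSlice is meant to be consumed (not part of the commit; the input file name and chunk size are illustrative): from_file() is a generator that walks the underlying file in fixed-size windows, and each slice can be read and hashed independently of the others.

# Usage sketch for FileSlice (hypothetical input file).
import hashlib

from ricecooker.utils.file_slice import FileSlice

with open("big_file.bin", "rb") as fd:
    for slice in FileSlice.from_file(fd, 10000000):  # 10 MB windows
        data = slice.read()  # bytes of this window only
        print(slice.size, hashlib.md5(data).hexdigest())
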
