Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle frontend loaded zip archive content (h5p, bloomd) that contains large files #12805

Draft
wants to merge 5 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions kolibri/core/content/test/test_zipcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,111 @@ def test_delete_not_allowed(self):
response = self._get_file(self.test_name_1, REQUEST_METHOD="DELETE")
self.assertEqual(response.status_code, 405)

def test_range_request_full_file(self):
"""Ensure normal request works with Accept-Ranges header"""
response = self._get_file(self.test_name_1)
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)
self.assertEqual(response.headers["Accept-Ranges"], "bytes")
self.assertEqual(response.status_code, 200)

def test_range_request_partial_file(self):
"""Test successful range request for partial file"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=2-5")
self.assertEqual(
next(response.streaming_content).decode(), self.test_str_1[2:6]
)
self.assertEqual(response.status_code, 206)
self.assertEqual(
response.headers["Content-Range"], f"bytes 2-5/{len(self.test_str_1)}"
)
self.assertEqual(response.headers["Content-Length"], "4")
self.assertEqual(response.headers["Accept-Ranges"], "bytes")

def test_range_request_end_of_file(self):
"""Test range request for last few bytes of file"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=-4")
self.assertEqual(
next(response.streaming_content).decode(), self.test_str_1[-4:]
)
self.assertEqual(response.status_code, 206)

def test_range_request_beyond_eof(self):
"""Test range request with start beyond file size"""
response = self._get_file(
self.test_name_1,
HTTP_RANGE=f"bytes={len(self.test_str_1) + 1}-{len(self.test_str_1) + 4}",
)
self.assertEqual(response.status_code, 200) # Should return full file
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)

def test_range_request_malformed(self):
"""Test malformed range header"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=invalid")
self.assertEqual(response.status_code, 200) # Should return full file
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)

def test_range_request_multiple_ranges(self):
"""Test multiple ranges - should return full file as we don't support multipart responses"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=0-2,4-6")
self.assertEqual(response.status_code, 200) # Should return full file
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)

def test_range_request_html_file(self):
"""Test range requests on HTML files that get modified - should return full file"""
response = self._get_file(self.script_name, HTTP_RANGE="bytes=0-10")
# Should return full modified file, not range
content = (
"<html><head>{}<script>test</script></head><body></body></html>".format(
hashi_injection
)
)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content.decode("utf-8"), content)

def test_range_request_large_file(self):
"""Test range request on a larger file to verify streaming"""
large_str = "Large text file " * 1024 # ~16KB file
large_file = "large.txt"

with zipfile.ZipFile(self.zip_path, "a") as zf:
zf.writestr(large_file, large_str)

# Request middle section of file
start = 1024
end = 2048
response = self._get_file(large_file, HTTP_RANGE=f"bytes={start}-{end}")

content = next(response.streaming_content).decode()
self.assertEqual(content, large_str[start : end + 1])
self.assertEqual(response.status_code, 206)
self.assertEqual(response.headers["Content-Length"], str(end - start + 1))

def test_options_request_accept_ranges_html(self):
"""Test OPTIONS request for HTML file returns Accept-Ranges: none"""
response = self._get_file(self.script_name, REQUEST_METHOD="OPTIONS")
self.assertEqual(response.headers["Accept-Ranges"], "none")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content.decode(), "")

def test_options_request_accept_ranges_binary(self):
"""Test OPTIONS request for non-HTML file returns Accept-Ranges: bytes"""
response = self._get_file(
self.test_name_1, REQUEST_METHOD="OPTIONS" # This is a .txt file
)
self.assertEqual(response.headers["Accept-Ranges"], "bytes")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content.decode(), "")

def test_accept_ranges_header_html(self):
"""Test Accept-Ranges header is 'none' for HTML files"""
response = self._get_file(self.script_name) # HTML file
self.assertEqual(response.headers["Accept-Ranges"], "none")

def test_accept_ranges_header_binary(self):
"""Test Accept-Ranges header is 'bytes' for non-HTML files"""
response = self._get_file(self.test_name_1) # txt file
self.assertEqual(response.headers["Accept-Ranges"], "bytes")


@override_option("Deployment", "ZIP_CONTENT_URL_PATH_PREFIX", "prefix_test/")
class UrlPrefixZipContentTestCase(ZipContentTestCase):
Expand Down
137 changes: 120 additions & 17 deletions kolibri/core/content/zip_wsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import mimetypes
import os
import re
import sys
import time
import zipfile
from urllib.parse import unquote
Expand All @@ -20,6 +21,12 @@
from django.utils.cache import patch_response_headers
from django.utils.encoding import force_str
from django.utils.http import http_date
from le_utils.constants.file_formats import BLOOMD
from le_utils.constants.file_formats import BLOOMPUB
from le_utils.constants.file_formats import H5P
from le_utils.constants.file_formats import HTML5
from le_utils.constants.file_formats import PERSEUS
from whitenoise.responders import StaticFile

from kolibri.core.content.errors import InvalidStorageFilenameError
from kolibri.core.content.utils.paths import get_content_storage_file_path
Expand All @@ -32,6 +39,61 @@
logger = logging.getLogger(__name__)


def parse_byte_range(range_header, file_size):
"""Parse Range header using whitenoise's implementation"""
try:
start, end = StaticFile.parse_byte_range(range_header)
if start >= file_size:
# If start is beyond EOF, return None to trigger full file response
return None

if end is None:
end = file_size - 1
else:
end = min(end, file_size - 1)

return (start if start >= 0 else file_size + start, end - start + 1)
except ValueError:
return None


class RangeZipFileObjectWrapper:
"""
A wrapper for a zip file object that supports byte range requests.
This is implemented for compatibility with Python 3.6, which does not
support seeking in file objects extracted from zip files.
This can be removed once Python 3.6 support is dropped.
"""

def __init__(self, file_object, start=0, length=None):
self.file_object = file_object
self.remaining = length
# Python 3.7+ zipfile has seek support
if sys.version_info >= (3, 7):
self.file_object.seek(start)
else:
# Read and discard data until we reach start position
while start > 0:
chunk_size = min(start, 8192)
self.file_object.read(chunk_size)
start -= chunk_size

def __iter__(self):
return self

def __next__(self):
if self.remaining is not None and self.remaining <= 0:
raise StopIteration()
chunk = self.file_object.read(
min(8192, self.remaining if self.remaining is not None else 8192)
)
if not chunk:
raise StopIteration()
if self.remaining is not None:
self.remaining -= len(chunk)
return chunk


def add_security_headers(request, response):
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Methods"] = "GET, OPTIONS"
Expand Down Expand Up @@ -127,7 +189,9 @@ def parse_html(content):
return content


def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
def get_embedded_file(
zipped_path, zipped_filename, embedded_filepath, range_header=None
):
with zipfile.ZipFile(zipped_path) as zf:
# if no path, or a directory, is being referenced, look for an index.html file
if not embedded_filepath or embedded_filepath.endswith("/"):
Expand All @@ -143,29 +207,57 @@ def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
)
)

# file size
file_size = 0

# try to guess the MIME type of the embedded file being referenced
content_type = (
mimetypes.guess_type(embedded_filepath)[0] or "application/octet-stream"
)
if embedded_filepath.endswith("htm") or embedded_filepath.endswith("html"):
content = zf.open(info).read()
zipped_file_object = zf.open(info)

is_html = embedded_filepath.lower().endswith(("html", "htm"))

if is_html:
content = zipped_file_object.read()
html = parse_html(content)
response = HttpResponse(html, content_type=content_type)
file_size = len(response.content)
else:
# generate a streaming response object, pulling data from within the zip file
response = FileResponse(zf.open(info), content_type=content_type)
status = 200
file_size = info.file_size
range_response_header = None

# handle byte-range requests
if range_header:
range_tuple = parse_byte_range(range_header, file_size)
if range_tuple:
start, length = range_tuple
zipped_file_object = RangeZipFileObjectWrapper(
zipped_file_object, start, length
)
status = 206
# Use the total file size of the object for the Content-Range header
range_response_header = (
f"bytes {start}-{start + length - 1}/{file_size}"
)
# Update the file size to the length of the requested range
file_size = length

response = FileResponse(
zipped_file_object, content_type=content_type, status=status
)
if range_response_header:
response.headers["Content-Range"] = range_response_header

# Only accept byte ranges for files that are not HTML
response.headers["Accept-Ranges"] = "none" if is_html else "bytes"
# set the content-length header to the size of the embedded file
if file_size:
response.headers["Content-Length"] = file_size
response.headers["Content-Length"] = file_size
return response


archive_file_types = (HTML5, H5P, BLOOMPUB, BLOOMD, PERSEUS)
archive_file_extension_match = "|".join(archive_file_types)

# Includes a prefix that is almost certain not to collide
# with a filename embedded in a zip file. Prefix is:
# @*._ followed by the encoded base url
Expand All @@ -176,7 +268,11 @@ def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
# the base URL, rather than having to load the entire zip file before
# loading the HTML5 content.
path_regex = re.compile(
r"/(?:(?P<base_url>(?![a-f0-9]{32}\.zip)[^/]+)/)?(?P<zipped_filename>[a-f0-9]{32}\.zip)/(?P<embedded_filepath>.*)"
r"/(?:(?P<base_url>(?![a-f0-9]{32}\.(?:"
+ archive_file_extension_match
+ r"))[^/]+)/)?(?P<zipped_filename>[a-f0-9]{32}\.(?:"
+ archive_file_extension_match
+ r"))/(?P<embedded_filepath>.*)"
)

YEAR_IN_SECONDS = 60 * 60 * 24 * 365
Expand Down Expand Up @@ -208,11 +304,17 @@ def _zip_content_from_request(request): # noqa: C901
"Path not found: {path}".format(path=request.path_info)
)

if request.method == "OPTIONS":
return HttpResponse()

remote_baseurl, zipped_filename, embedded_filepath = match.groups()

if request.method == "OPTIONS":
response = HttpResponse()
# If path ends with html/htm, set Accept-Ranges to none
if embedded_filepath.lower().endswith(("html", "htm")):
response.headers["Accept-Ranges"] = "none"
else:
response.headers["Accept-Ranges"] = "bytes"
return response

try:
# calculate the local file path to the zip file
zipped_path = get_content_storage_file_path(zipped_filename)
Expand Down Expand Up @@ -277,8 +379,12 @@ def _zip_content_from_request(request): # noqa: C901
if cached_response is not None:
return cached_response

range_header = request.META.get("HTTP_RANGE")

try:
response = get_embedded_file(zipped_path, zipped_filename, embedded_filepath)
response = get_embedded_file(
zipped_path, zipped_filename, embedded_filepath, range_header=range_header
)
except Exception:
if remote_baseurl:
return create_error_response(
Expand All @@ -288,9 +394,6 @@ def _zip_content_from_request(request): # noqa: C901
)
raise

# ensure the browser knows not to try byte-range requests, as we don't support them here
response.headers["Accept-Ranges"] = "none"

response.headers["Last-Modified"] = http_date(time.time())

patch_response_headers(response, cache_timeout=YEAR_IN_SECONDS)
Expand Down
3 changes: 2 additions & 1 deletion packages/hashi/src/H5P/H5PRunner.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,10 @@ export default class H5PRunner {
// and for logging xAPI statements about the content.
this.contentNamespace = CONTENT_ID;
const start = performance.now();
const largeFileUrlGenerator = filePath => `${this.zipcontentUrl}/${filePath}`;
// First load the full H5P file
// Store the zip locally for later reference
this.zip = new ZipFile(this.filepath);
this.zip = new ZipFile(this.filepath, { largeFileUrlGenerator });
// Recurse all the package dependencies
return this.recurseDependencies('h5p.json', true).then(() => {
// Once we have found all the dependencies, we call this
Expand Down
7 changes: 7 additions & 0 deletions packages/kolibri-zip/src/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// ZIP End of Central Directory Record signature
export const EOCD_SIGNATURE = 0x06054b50;
export const EOCD_SIZE = 22; // Minimum size of end of central directory record
export const MAX_COMMENT_SIZE = 65535; // Maximum ZIP comment size
export const LARGE_FILE_THRESHOLD = 500 * 1024; // 500KB
export const LOCAL_FILE_HEADER_SIGNATURE = 0x04034b50;
export const LOCAL_FILE_HEADER_FIXED_SIZE = 30;
Loading