-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
File Uploads + media_server app #33
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -65,3 +65,5 @@ dev.db | |
|
||
.vscode | ||
|
||
# Media files (for uploads) | ||
media/ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,9 +11,9 @@ | |
|
||
Open Question: If the data model is extensible, how do we know whether a change | ||
has really happened between what's currently stored/published for a particular | ||
item and the new value we want to set? For Content that's easy, because we have | ||
actual hashes of the data. But it's not clear how that would work for something | ||
like an ComponentVersion. We'd have to have some kind of mechanism where every | ||
item and the new value we want to set? For RawContent that's easy, because we | ||
have actual hashes of the data. But it's not clear how that would work for | ||
something like an ComponentVersion. We'd have to have some kind of mechanism where every | ||
pp that wants to attach data gets to answer the question of "has anything | ||
changed?" in order to decide if we really make a new ComponentVersion or not. | ||
""" | ||
|
@@ -25,22 +25,28 @@ | |
import re | ||
import xml.etree.ElementTree as ET | ||
|
||
from django.core.files.base import ContentFile | ||
from django.core.management.base import BaseCommand | ||
from django.db import transaction | ||
|
||
from openedx_learning.core.publishing.models import LearningPackage, PublishLogEntry | ||
from openedx_learning.core.components.models import ( | ||
Content, Component, ComponentVersion, ComponentVersionContent, | ||
ComponentPublishLogEntry, PublishedComponent, | ||
Component, | ||
ComponentVersion, | ||
ComponentVersionRawContent, | ||
ComponentPublishLogEntry, | ||
PublishedComponent, | ||
RawContent, | ||
TextContent, | ||
) | ||
from openedx_learning.lib.fields import create_hash_digest | ||
|
||
SUPPORTED_TYPES = ['problem', 'video', 'html'] | ||
SUPPORTED_TYPES = ["problem", "video", "html"] | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Command(BaseCommand): | ||
help = 'Load sample Component data from course export' | ||
help = "Load sample Component data from course export" | ||
|
||
def __init__(self, *args, **kwargs): | ||
super().__init__(*args, **kwargs) | ||
|
@@ -61,20 +67,19 @@ def init_known_types(self): | |
# officially "text/javascript" | ||
mimetypes.add_type("text/javascript", ".js") | ||
mimetypes.add_type("text/javascript", ".mjs") | ||
|
||
|
||
def add_arguments(self, parser): | ||
parser.add_argument('course_data_path', type=pathlib.Path) | ||
parser.add_argument('learning_package_identifier', type=str) | ||
parser.add_argument("course_data_path", type=pathlib.Path) | ||
parser.add_argument("learning_package_identifier", type=str) | ||
|
||
def handle(self, course_data_path, learning_package_identifier, **options): | ||
self.course_data_path = course_data_path | ||
self.learning_package_identifier = learning_package_identifier | ||
self.load_course_data(learning_package_identifier) | ||
|
||
def get_course_title(self): | ||
course_type_dir = self.course_data_path / 'course' | ||
course_xml_file = next(course_type_dir.glob('*.xml')) | ||
course_type_dir = self.course_data_path / "course" | ||
course_xml_file = next(course_type_dir.glob("*.xml")) | ||
course_root = ET.parse(course_xml_file).getroot() | ||
return course_root.attrib.get("display_name", "Unknown Course") | ||
|
||
|
@@ -87,9 +92,9 @@ def load_course_data(self, learning_package_identifier): | |
learning_package, _created = LearningPackage.objects.get_or_create( | ||
identifier=learning_package_identifier, | ||
defaults={ | ||
'title': title, | ||
'created': now, | ||
'updated': now, | ||
"title": title, | ||
"created": now, | ||
"updated": now, | ||
}, | ||
) | ||
self.learning_package = learning_package | ||
|
@@ -105,11 +110,13 @@ def load_course_data(self, learning_package_identifier): | |
self.import_block_type(block_type, now, publish_log_entry) | ||
|
||
def create_content(self, static_local_path, now, component_version): | ||
identifier = pathlib.Path('static') / static_local_path | ||
identifier = pathlib.Path("static") / static_local_path | ||
real_path = self.course_data_path / identifier | ||
mime_type, _encoding = mimetypes.guess_type(identifier) | ||
if mime_type is None: | ||
logger.error(f" no mimetype found for {real_path}, defaulting to application/binary") | ||
logger.error( | ||
f" no mimetype found for {real_path}, defaulting to application/binary" | ||
) | ||
mime_type = "application/binary" | ||
|
||
try: | ||
|
@@ -120,20 +127,26 @@ def create_content(self, static_local_path, now, component_version): | |
|
||
hash_digest = create_hash_digest(data_bytes) | ||
|
||
content, _created = Content.objects.get_or_create( | ||
raw_content, created = RawContent.objects.get_or_create( | ||
learning_package=self.learning_package, | ||
mime_type=mime_type, | ||
hash_digest=hash_digest, | ||
defaults = dict( | ||
data=data_bytes, | ||
defaults=dict( | ||
size=len(data_bytes), | ||
created=now, | ||
) | ||
), | ||
) | ||
ComponentVersionContent.objects.get_or_create( | ||
if created: | ||
raw_content.file.save( | ||
f"{raw_content.learning_package.uuid}/{hash_digest}", | ||
ContentFile(data_bytes), | ||
) | ||
|
||
ComponentVersionRawContent.objects.get_or_create( | ||
component_version=component_version, | ||
content=content, | ||
raw_content=raw_content, | ||
identifier=identifier, | ||
learner_downloadable=True, | ||
) | ||
|
||
def import_block_type(self, block_type, now, publish_log_entry): | ||
|
@@ -146,35 +159,49 @@ def import_block_type(self, block_type, now, publish_log_entry): | |
static_files_regex = re.compile(r"""['"]\/static\/(.+?)["'\?]""") | ||
block_data_path = self.course_data_path / block_type | ||
|
||
for xml_file_path in block_data_path.glob('*.xml'): | ||
for xml_file_path in block_data_path.glob("*.xml"): | ||
components_found += 1 | ||
identifier = xml_file_path.stem | ||
|
||
# Find or create the Component itself | ||
component, _created = Component.objects.get_or_create( | ||
learning_package=self.learning_package, | ||
namespace='xblock.v1', | ||
namespace="xblock.v1", | ||
type=block_type, | ||
identifier=identifier, | ||
defaults = { | ||
'created': now, | ||
} | ||
defaults={ | ||
"created": now, | ||
}, | ||
) | ||
|
||
# Create the Content entry for the raw data... | ||
# Create the RawContent entry for the raw data... | ||
data_bytes = xml_file_path.read_bytes() | ||
hash_digest = create_hash_digest(data_bytes) | ||
data_str = codecs.decode(data_bytes, 'utf-8') | ||
content, _created = Content.objects.get_or_create( | ||
|
||
raw_content, created = RawContent.objects.get_or_create( | ||
learning_package=self.learning_package, | ||
mime_type=f'application/vnd.openedx.xblock.v1.{block_type}+xml', | ||
mime_type=f"application/vnd.openedx.xblock.v1.{block_type}+xml", | ||
hash_digest=hash_digest, | ||
defaults = dict( | ||
data=data_bytes, | ||
defaults=dict( | ||
# text=data_str, | ||
size=len(data_bytes), | ||
created=now, | ||
) | ||
), | ||
) | ||
if created: | ||
raw_content.file.save( | ||
f"{raw_content.learning_package.uuid}/{hash_digest}", | ||
ContentFile(data_bytes), | ||
) | ||
|
||
# Decode as utf-8-sig in order to strip any BOM from the data. | ||
data_str = codecs.decode(data_bytes, "utf-8-sig") | ||
TextContent.objects.create( | ||
raw_content=raw_content, | ||
text=data_str, | ||
length=len(data_str), | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we check if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At this point I want it to throw an error. Anything that big is probably going to cause issues down the road, and I'm honestly curious where we actually have that in the wild. I suspect in most cases, it's because something unexpected and weird is happening, like copy-pasting from a Word doc and bringing the images over as base64-encoded HTML. |
||
|
||
# TODO: Get associated file contents, both with the static regex, as | ||
# well as with XBlock-specific code that examines attributes in | ||
# video and HTML tag definitions. | ||
|
@@ -185,7 +212,7 @@ def import_block_type(self, block_type, now, publish_log_entry): | |
logger.error(f"Parse error for {xml_file_path}: {err}") | ||
continue | ||
|
||
display_name = block_root.attrib.get('display_name', "") | ||
display_name = block_root.attrib.get("display_name", "") | ||
|
||
# Create the ComponentVersion | ||
component_version = ComponentVersion.objects.create( | ||
|
@@ -195,10 +222,11 @@ def import_block_type(self, block_type, now, publish_log_entry): | |
created=now, | ||
created_by=None, | ||
) | ||
ComponentVersionContent.objects.create( | ||
ComponentVersionRawContent.objects.create( | ||
component_version=component_version, | ||
content=content, | ||
identifier='source.xml', | ||
raw_content=raw_content, | ||
identifier="source.xml", | ||
learner_downloadable=False, | ||
) | ||
static_files_found = static_files_regex.findall(data_str) | ||
for static_local_path in static_files_found: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from django.apps import AppConfig | ||
from django.core.exceptions import ImproperlyConfigured | ||
from django.conf import settings | ||
|
||
|
||
class MediaServerConfig(AppConfig): | ||
""" | ||
Configuration for the Media Server application. | ||
""" | ||
|
||
name = "openedx_learning.contrib.media_server" | ||
verbose_name = "Learning Core: Media Server" | ||
default_auto_field = "django.db.models.BigAutoField" | ||
|
||
def ready(self): | ||
if not settings.DEBUG: | ||
# Until we get proper security and support for running this app | ||
# under a separate domain, just don't allow it to be run in | ||
# production environments. | ||
raise ImproperlyConfigured( | ||
"The media_server app should only be run in DEBUG mode!" | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
Media Server App | ||
================ | ||
|
||
The ``media_server`` app exists to serve media files that are ultimately backed by the RawContent model, *for development purposes and for sites with light-to-moderate traffic*. It also provides an API that can be used by CDNs for high traffic sites. | ||
|
||
Motivation | ||
---------- | ||
|
||
The ``components`` app stores large binary file data by calculating the hash and creating a django-storages backed file named after that hash. This is efficient from a storage point of view, because we don't store redundant copies for every version of a Component. There are at least two drawbacks: | ||
|
||
* We have unintelligibly named files that are confusing for clients. | ||
* Intra-file links between media files break. For instance, if we have a piece of HTML that makes a reference to a VTT file, that filename will have changed. | ||
|
||
This app tries to bridge that gap by serving URLs that preserve the original file names and give the illusion that there is a seprate set of media files for every version of a Component, but does a lookup behind the scenes to serve the correct hash-based-file. | ||
|
||
The big caveat on this is that Django is not really optimized to do this sort of asset serving. The most scalable approach is to have a CDN-backed solution where ``media_server`` serves the locations of files that are converted by worker code to serving the actual assets. (More details to follow when that part gets built out.) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from django.urls import path | ||
|
||
from .views import component_asset | ||
|
||
urlpatterns = [ | ||
path( | ||
( | ||
"component_asset/" | ||
"<str:learning_package_identifier>/" | ||
"<str:component_identifier>/" | ||
"<int:version_num>/" | ||
"<path:asset_path>" | ||
), | ||
component_asset, | ||
) | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from pathlib import Path | ||
|
||
from django.http import FileResponse | ||
from django.http import Http404 | ||
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied | ||
|
||
from openedx_learning.core.components.api import get_component_version_content | ||
|
||
|
||
def component_asset( | ||
request, learning_package_identifier, component_identifier, version_num, asset_path | ||
): | ||
""" | ||
Serve the ComponentVersion asset data. | ||
|
||
This function maps from a logical URL with Component and verison data like: | ||
media_server/component_asset/course101/finalexam-problem14/1/static/images/fig3.png | ||
To the actual data file as stored in file/object storage, which looks like: | ||
media/055499fd-f670-451a-9727-501ea9dfbf5b/a9528d66739a297aa0cd17106b0bc0f7515b8e78 | ||
|
||
TODO: | ||
* ETag support | ||
* Range queries | ||
* Serving from a different domain than the rest of the service | ||
""" | ||
try: | ||
cvc = get_component_version_content( | ||
learning_package_identifier, component_identifier, version_num, asset_path | ||
) | ||
except ObjectDoesNotExist: | ||
raise Http404("File not found") | ||
|
||
if not cvc.learner_downloadable and not ( | ||
request.user and request.user.is_superuser | ||
): | ||
raise PermissionDenied("This file is not publicly downloadable.") | ||
|
||
response = FileResponse(cvc.raw_content.file, filename=Path(asset_path).name) | ||
response["Content-Type"] = cvc.raw_content.mime_type | ||
|
||
return response |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Contrib Package | ||
=============== | ||
|
||
The ``contrib`` package holds Django apps that *could* be implemented in separate repos, but are bundled here because it's more convenient to do so. | ||
|
||
Guidelines | ||
---------- | ||
|
||
Nothing from ``lib`` or ``core`` should *ever* import from ``contrib``. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The isolated apps linter could enforce this for you if you wanted down the road :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do have some primitive import linting set up here, though I need to actually stop being lazy and hook up the CI to run it properly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh awesome :) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, at least for smaller OLX content, why to we need to save the file to S3 if we have it in-DB via
TextContent
? Especially if it'slearner_downloadable=False
, is there any use case for reading from S3? From the docstring on the models it sounds like this is just for overall consistency, but I'm wondering if there's a use case?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's mostly consistency, and I thought it might make export simpler. I also thought that there might be some edge cases where we want to later promote RawContent to TextContent after the fact, and have them exist in both places–like video transcripts. But mostly I just like the idea that everything is minimally represented as an opaque blob, and there can be multiple levels of progressive enhancement of that data over time, e.g. XBlockContent, Video, etc.