Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File Uploads + media_server app #33

Merged
merged 2 commits into from
Apr 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,5 @@ dev.db

.vscode

# Media files (for uploads)
media/
1 change: 1 addition & 0 deletions olx_importer/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ class OLXImporterConfig(AppConfig):
"""
Configuration for the OLX Importer Django application.
"""

name = "olx_importer"
verbose_name = "OLX Importer"
106 changes: 67 additions & 39 deletions olx_importer/management/commands/load_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

Open Question: If the data model is extensible, how do we know whether a change
has really happened between what's currently stored/published for a particular
item and the new value we want to set? For Content that's easy, because we have
actual hashes of the data. But it's not clear how that would work for something
like an ComponentVersion. We'd have to have some kind of mechanism where every
item and the new value we want to set? For RawContent that's easy, because we
have actual hashes of the data. But it's not clear how that would work for
something like a ComponentVersion. We'd have to have some kind of mechanism where every
app that wants to attach data gets to answer the question of "has anything
changed?" in order to decide if we really make a new ComponentVersion or not.
"""
Expand All @@ -25,22 +25,28 @@
import re
import xml.etree.ElementTree as ET

from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand
from django.db import transaction

from openedx_learning.core.publishing.models import LearningPackage, PublishLogEntry
from openedx_learning.core.components.models import (
Content, Component, ComponentVersion, ComponentVersionContent,
ComponentPublishLogEntry, PublishedComponent,
Component,
ComponentVersion,
ComponentVersionRawContent,
ComponentPublishLogEntry,
PublishedComponent,
RawContent,
TextContent,
)
from openedx_learning.lib.fields import create_hash_digest

SUPPORTED_TYPES = ['problem', 'video', 'html']
SUPPORTED_TYPES = ["problem", "video", "html"]
logger = logging.getLogger(__name__)


class Command(BaseCommand):
help = 'Load sample Component data from course export'
help = "Load sample Component data from course export"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -61,20 +67,19 @@ def init_known_types(self):
# officially "text/javascript"
mimetypes.add_type("text/javascript", ".js")
mimetypes.add_type("text/javascript", ".mjs")


def add_arguments(self, parser):
parser.add_argument('course_data_path', type=pathlib.Path)
parser.add_argument('learning_package_identifier', type=str)
parser.add_argument("course_data_path", type=pathlib.Path)
parser.add_argument("learning_package_identifier", type=str)

def handle(self, course_data_path, learning_package_identifier, **options):
self.course_data_path = course_data_path
self.learning_package_identifier = learning_package_identifier
self.load_course_data(learning_package_identifier)

def get_course_title(self):
course_type_dir = self.course_data_path / 'course'
course_xml_file = next(course_type_dir.glob('*.xml'))
course_type_dir = self.course_data_path / "course"
course_xml_file = next(course_type_dir.glob("*.xml"))
course_root = ET.parse(course_xml_file).getroot()
return course_root.attrib.get("display_name", "Unknown Course")

Expand All @@ -87,9 +92,9 @@ def load_course_data(self, learning_package_identifier):
learning_package, _created = LearningPackage.objects.get_or_create(
identifier=learning_package_identifier,
defaults={
'title': title,
'created': now,
'updated': now,
"title": title,
"created": now,
"updated": now,
},
)
self.learning_package = learning_package
Expand All @@ -105,11 +110,13 @@ def load_course_data(self, learning_package_identifier):
self.import_block_type(block_type, now, publish_log_entry)

def create_content(self, static_local_path, now, component_version):
identifier = pathlib.Path('static') / static_local_path
identifier = pathlib.Path("static") / static_local_path
real_path = self.course_data_path / identifier
mime_type, _encoding = mimetypes.guess_type(identifier)
if mime_type is None:
logger.error(f" no mimetype found for {real_path}, defaulting to application/binary")
logger.error(
f" no mimetype found for {real_path}, defaulting to application/binary"
)
mime_type = "application/binary"

try:
Expand All @@ -120,20 +127,26 @@ def create_content(self, static_local_path, now, component_version):

hash_digest = create_hash_digest(data_bytes)

content, _created = Content.objects.get_or_create(
raw_content, created = RawContent.objects.get_or_create(
learning_package=self.learning_package,
mime_type=mime_type,
hash_digest=hash_digest,
defaults = dict(
data=data_bytes,
defaults=dict(
size=len(data_bytes),
created=now,
)
),
)
ComponentVersionContent.objects.get_or_create(
if created:
raw_content.file.save(
f"{raw_content.learning_package.uuid}/{hash_digest}",
ContentFile(data_bytes),
)

ComponentVersionRawContent.objects.get_or_create(
component_version=component_version,
content=content,
raw_content=raw_content,
identifier=identifier,
learner_downloadable=True,
)

def import_block_type(self, block_type, now, publish_log_entry):
Expand All @@ -146,35 +159,49 @@ def import_block_type(self, block_type, now, publish_log_entry):
static_files_regex = re.compile(r"""['"]\/static\/(.+?)["'\?]""")
block_data_path = self.course_data_path / block_type

for xml_file_path in block_data_path.glob('*.xml'):
for xml_file_path in block_data_path.glob("*.xml"):
components_found += 1
identifier = xml_file_path.stem

# Find or create the Component itself
component, _created = Component.objects.get_or_create(
learning_package=self.learning_package,
namespace='xblock.v1',
namespace="xblock.v1",
type=block_type,
identifier=identifier,
defaults = {
'created': now,
}
defaults={
"created": now,
},
)

# Create the Content entry for the raw data...
# Create the RawContent entry for the raw data...
data_bytes = xml_file_path.read_bytes()
hash_digest = create_hash_digest(data_bytes)
data_str = codecs.decode(data_bytes, 'utf-8')
content, _created = Content.objects.get_or_create(

raw_content, created = RawContent.objects.get_or_create(
learning_package=self.learning_package,
mime_type=f'application/vnd.openedx.xblock.v1.{block_type}+xml',
mime_type=f"application/vnd.openedx.xblock.v1.{block_type}+xml",
hash_digest=hash_digest,
defaults = dict(
data=data_bytes,
defaults=dict(
# text=data_str,
size=len(data_bytes),
created=now,
)
),
)
if created:
raw_content.file.save(
f"{raw_content.learning_package.uuid}/{hash_digest}",
ContentFile(data_bytes),
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, at least for smaller OLX content, why do we need to save the file to S3 if we have it in-DB via TextContent? Especially if it's learner_downloadable=False, is there any use case for reading from S3? From the docstring on the models it sounds like this is just for overall consistency, but I'm wondering if there's a use case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's mostly consistency, and I thought it might make export simpler. I also thought that there might be some edge cases where we want to later promote RawContent to TextContent after the fact, and have them exist in both places–like video transcripts. But mostly I just like the idea that everything is minimally represented as an opaque blob, and there can be multiple levels of progressive enhancement of that data over time, e.g. XBlockContent, Video, etc.


# Decode as utf-8-sig in order to strip any BOM from the data.
data_str = codecs.decode(data_bytes, "utf-8-sig")
TextContent.objects.create(
raw_content=raw_content,
text=data_str,
length=len(data_str),
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check if len(data_bytes) < MAX_TEXT_LENGTH before trying to save the copy of the OLX into TextContent ? (store it on S3 only, but still allow it) Or do we want to always throw an error for such large OLX?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At this point I want it to throw an error. Anything that big is probably going to cause issues down the road, and I'm honestly curious where we actually have that in the wild. I suspect in most cases, it's because something unexpected and weird is happening, like copy-pasting from a Word doc and bringing the images over as base64-encoded HTML.


# TODO: Get associated file contents, both with the static regex, as
# well as with XBlock-specific code that examines attributes in
# video and HTML tag definitions.
Expand All @@ -185,7 +212,7 @@ def import_block_type(self, block_type, now, publish_log_entry):
logger.error(f"Parse error for {xml_file_path}: {err}")
continue

display_name = block_root.attrib.get('display_name', "")
display_name = block_root.attrib.get("display_name", "")

# Create the ComponentVersion
component_version = ComponentVersion.objects.create(
Expand All @@ -195,10 +222,11 @@ def import_block_type(self, block_type, now, publish_log_entry):
created=now,
created_by=None,
)
ComponentVersionContent.objects.create(
ComponentVersionRawContent.objects.create(
component_version=component_version,
content=content,
identifier='source.xml',
raw_content=raw_content,
identifier="source.xml",
learner_downloadable=False,
)
static_files_found = static_files_regex.findall(data_str)
for static_local_path in static_files_found:
Expand Down
Empty file.
22 changes: 22 additions & 0 deletions openedx_learning/contrib/media_server/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from django.apps import AppConfig
from django.core.exceptions import ImproperlyConfigured
from django.conf import settings


class MediaServerConfig(AppConfig):
    """
    Django AppConfig for the Media Server app.

    This app is development-only for now: ``ready()`` refuses to start
    outside of DEBUG mode.
    """

    name = "openedx_learning.contrib.media_server"
    verbose_name = "Learning Core: Media Server"
    default_auto_field = "django.db.models.BigAutoField"

    def ready(self):
        # Guard clause: under DEBUG there is nothing to enforce.
        if settings.DEBUG:
            return
        # Until we get proper security and support for running this app
        # under a separate domain, just don't allow it to be run in
        # production environments.
        raise ImproperlyConfigured(
            "The media_server app should only be run in DEBUG mode!"
        )
16 changes: 16 additions & 0 deletions openedx_learning/contrib/media_server/readme.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Media Server App
================

The ``media_server`` app exists to serve media files that are ultimately backed by the RawContent model, *for development purposes and for sites with light-to-moderate traffic*. It also provides an API that can be used by CDNs for high traffic sites.

Motivation
----------

The ``components`` app stores large binary file data by calculating the hash and creating a django-storages backed file named after that hash. This is efficient from a storage point of view, because we don't store redundant copies for every version of a Component. There are at least two drawbacks:

* We have unintelligibly named files that are confusing for clients.
* Intra-file links between media files break. For instance, if we have a piece of HTML that makes a reference to a VTT file, that filename will have changed.

This app tries to bridge that gap by serving URLs that preserve the original file names and give the illusion that there is a separate set of media files for every version of a Component, but does a lookup behind the scenes to serve the correct hash-based-file.

The big caveat on this is that Django is not really optimized to do this sort of asset serving. The most scalable approach is to have a CDN-backed solution where ``media_server`` serves the locations of files that are converted by worker code to serving the actual assets. (More details to follow when that part gets built out.)
16 changes: 16 additions & 0 deletions openedx_learning/contrib/media_server/urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from django.urls import path

from .views import component_asset

# Logical asset URL shape:
#   component_asset/<learning package>/<component>/<version>/<path within component>
_COMPONENT_ASSET_ROUTE = (
    "component_asset/"
    "<str:learning_package_identifier>/"
    "<str:component_identifier>/"
    "<int:version_num>/"
    "<path:asset_path>"
)

urlpatterns = [
    path(_COMPONENT_ASSET_ROUTE, component_asset),
]
41 changes: 41 additions & 0 deletions openedx_learning/contrib/media_server/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from pathlib import Path

from django.http import FileResponse
from django.http import Http404
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied

from openedx_learning.core.components.api import get_component_version_content


def component_asset(
    request, learning_package_identifier, component_identifier, version_num, asset_path
):
    """
    Serve the ComponentVersion asset data.

    This function maps from a logical URL with Component and version data like:
        media_server/component_asset/course101/finalexam-problem14/1/static/images/fig3.png
    To the actual data file as stored in file/object storage, which looks like:
        media/055499fd-f670-451a-9727-501ea9dfbf5b/a9528d66739a297aa0cd17106b0bc0f7515b8e78

    Raises Http404 when the lookup finds no matching content, and
    PermissionDenied when the asset is not learner-downloadable and the
    requesting user is not a superuser.

    TODO:
    * ETag support
    * Range queries
    * Serving from a different domain than the rest of the service
    """
    try:
        cvc = get_component_version_content(
            learning_package_identifier, component_identifier, version_num, asset_path
        )
    except ObjectDoesNotExist:
        raise Http404("File not found")

    # Non-downloadable assets are restricted to superusers.
    # NOTE(review): presumably staff/instructor access comes later — confirm.
    if not cvc.learner_downloadable and not (
        request.user and request.user.is_superuser
    ):
        raise PermissionDenied("This file is not publicly downloadable.")

    # Stream the hash-named stored file back under the asset's original
    # filename; the stored mime_type drives the Content-Type header.
    response = FileResponse(cvc.raw_content.file, filename=Path(asset_path).name)
    response["Content-Type"] = cvc.raw_content.mime_type

    return response
9 changes: 9 additions & 0 deletions openedx_learning/contrib/readme.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Contrib Package
===============

The ``contrib`` package holds Django apps that *could* be implemented in separate repos, but are bundled here because it's more convenient to do so.

Guidelines
----------

Nothing from ``lib`` or ``core`` should *ever* import from ``contrib``.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The isolated apps linter could enforce this for you if you wanted down the road :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do have some primitive import linting set up here, though I need to actually stop being lazy and hook up the CI to run it properly.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh awesome :)

Loading