diff --git a/openedx_learning/__init__.py b/openedx_learning/__init__.py index 73a0541a..8b9f3bfc 100644 --- a/openedx_learning/__init__.py +++ b/openedx_learning/__init__.py @@ -1,4 +1,4 @@ """ Open edX Learning ("Learning Core"). """ -__version__ = "0.3.7" +__version__ = "0.4.0" diff --git a/openedx_learning/core/components/admin.py b/openedx_learning/core/components/admin.py index 5710303a..c5fb9b16 100644 --- a/openedx_learning/core/components/admin.py +++ b/openedx_learning/core/components/admin.py @@ -158,7 +158,7 @@ def content_preview(cvc_obj: ComponentVersionRawContent) -> SafeText: """ raw_content_obj = cvc_obj.raw_content - if raw_content_obj.mime_type.startswith("image/"): + if raw_content_obj.media_type.type == "image": return format_html( '', # TODO: configure with settings value: diff --git a/openedx_learning/core/components/api.py b/openedx_learning/core/components/api.py index 61e7e574..4275c5d1 100644 --- a/openedx_learning/core/components/api.py +++ b/openedx_learning/core/components/api.py @@ -117,6 +117,7 @@ def get_component_version_content( """ return ComponentVersionRawContent.objects.select_related( "raw_content", + "raw_content__media_type", "component_version", "component_version__component", "component_version__component__learning_package", diff --git a/openedx_learning/core/components/migrations/0001_initial.py b/openedx_learning/core/components/migrations/0001_initial.py index 0ca82b95..bbe0e15d 100644 --- a/openedx_learning/core/components/migrations/0001_initial.py +++ b/openedx_learning/core/components/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.19 on 2023-06-15 14:43 +# Generated by Django 3.2.23 on 2023-12-04 00:41 import uuid @@ -13,7 +13,7 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ('oel_publishing', '0001_initial'), + ('oel_publishing', '0002_alter_fk_on_delete'), ('oel_contents', '0001_initial'), ] diff --git a/openedx_learning/core/contents/admin.py b/openedx_learning/core/contents/admin.py index 0ce7ed61..8409fe90 100644 --- a/openedx_learning/core/contents/admin.py +++ b/openedx_learning/core/contents/admin.py @@ -18,14 +18,14 @@ class RawContentAdmin(ReadOnlyModelAdmin): "hash_digest", "file_link", "learning_package", - "mime_type", + "media_type", "size", "created", ] fields = [ "learning_package", "hash_digest", - "mime_type", + "media_type", "size", "created", "file_link", @@ -34,13 +34,13 @@ class RawContentAdmin(ReadOnlyModelAdmin): readonly_fields = [ "learning_package", "hash_digest", - "mime_type", + "media_type", "size", "created", "file_link", "text_preview", ] - list_filter = ("mime_type", "learning_package") + list_filter = ("media_type", "learning_package") search_fields = ("hash_digest",) def file_link(self, raw_content): diff --git a/openedx_learning/core/contents/api.py b/openedx_learning/core/contents/api.py index fd4e069a..c29255b2 100644 --- a/openedx_learning/core/contents/api.py +++ b/openedx_learning/core/contents/api.py @@ -12,9 +12,10 @@ from django.core.files.base import ContentFile from django.db.transaction import atomic +from openedx_learning.lib.cache import lru_cache from openedx_learning.lib.fields import create_hash_digest -from .models import RawContent, TextContent +from .models import MediaType, RawContent, TextContent def create_raw_content( @@ -28,9 +29,10 @@ def create_raw_content( Create a new RawContent instance and persist it to storage. """ hash_digest = hash_digest or create_hash_digest(data_bytes) + raw_content = RawContent.objects.create( learning_package_id=learning_package_id, - mime_type=mime_type, + media_type_id=get_media_type_id(mime_type), hash_digest=hash_digest, size=len(data_bytes), created=created, @@ -54,6 +56,39 @@ def create_text_from_raw_content(raw_content: RawContent, encoding="utf-8-sig") ) +@lru_cache(maxsize=128) +def get_media_type_id(mime_type: str) -> int: + """ + Return the MediaType.id for the desired mime_type string. + + If it is not found in the database, a new entry will be created for it. This + lazy-writing means that MediaType entry IDs will *not* be the same across + different server instances, and apps should not assume that will be the + case. Even if we were to preload a bunch of common ones, we can't anticipate + the different XBlocks that will be installed in different server instances, + each of which will use their own MediaType. + + This will typically only be called when create_raw_content is calling it to + lookup the media_type_id it should use for a new RawContent. If you already + have a RawContent instance, it makes much more sense to access its + media_type relation. + """ + if "+" in mime_type: + base, suffix = mime_type.split("+") + else: + base = mime_type + suffix = "" + + main_type, sub_type = base.split("/") + mt, _created = MediaType.objects.get_or_create( + type=main_type, + sub_type=sub_type, + suffix=suffix, + ) + + return mt.id + + def get_or_create_raw_content( learning_package_id: int, data_bytes: bytes, diff --git a/openedx_learning/core/contents/migrations/0001_initial.py b/openedx_learning/core/contents/migrations/0001_initial.py index 20b6b01c..7dac9a3f 100644 --- a/openedx_learning/core/contents/migrations/0001_initial.py +++ b/openedx_learning/core/contents/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.19 on 2023-06-15 14:43 +# Generated by Django 3.2.23 on 2023-12-04 00:41 import django.core.validators import django.db.models.deletion @@ -13,20 +13,29 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ('oel_publishing', '0001_initial'), + ('oel_publishing', '0002_alter_fk_on_delete'), ] operations = [ + migrations.CreateModel( + name='MediaType', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), + ('sub_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), + ('suffix', openedx_learning.lib.fields.MultiCollationCharField(blank=True, db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=127)), + ], + ), migrations.CreateModel( name='RawContent', fields=[ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('hash_digest', models.CharField(editable=False, max_length=40)), - ('mime_type', openedx_learning.lib.fields.MultiCollationCharField(db_collations={'mysql': 'utf8mb4_unicode_ci', 'sqlite': 'NOCASE'}, max_length=255)), ('size', models.PositiveBigIntegerField(validators=[django.core.validators.MaxValueValidator(50000000)])), ('created', models.DateTimeField(validators=[openedx_learning.lib.validators.validate_utc_datetime])), ('file', models.FileField(null=True, upload_to='')), ('learning_package', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='oel_publishing.learningpackage')), + ('media_type', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='oel_contents.mediatype')), ], options={ 'verbose_name': 'Raw Content', @@ -41,10 +50,13 @@ class Migration(migrations.Migration): ('length', models.PositiveIntegerField()), ], ), - # Call out to custom code here to change row format for TextContent + migrations.AddConstraint( + model_name='mediatype', + constraint=models.UniqueConstraint(fields=('type', 'sub_type', 'suffix'), name='oel_contents_uniq_t_st_sfx'), + ), migrations.AddIndex( model_name='rawcontent', - index=models.Index(fields=['learning_package', 'mime_type'], name='oel_content_idx_lp_mime_type'), + index=models.Index(fields=['learning_package', 'media_type'], name='oel_content_idx_lp_media_type'), ), migrations.AddIndex( model_name='rawcontent', @@ -56,6 +68,6 @@ class Migration(migrations.Migration): ), migrations.AddConstraint( model_name='rawcontent', - constraint=models.UniqueConstraint(fields=('learning_package', 'mime_type', 'hash_digest'), name='oel_content_uniq_lc_mime_type_hash_digest'), + constraint=models.UniqueConstraint(fields=('learning_package', 'media_type', 'hash_digest'), name='oel_content_uniq_lc_media_type_hash_digest'), ), ] diff --git a/openedx_learning/core/contents/models.py b/openedx_learning/core/contents/models.py index 2da7ebfe..e7de7375 100644 --- a/openedx_learning/core/contents/models.py +++ b/openedx_learning/core/contents/models.py @@ -3,6 +3,8 @@ the simplest building blocks to store data with. They need to be composed into more intelligent data models to be useful. """ +from functools import cached_property + from django.conf import settings from django.core.files.storage import default_storage from django.core.validators import MaxValueValidator @@ -18,6 +20,81 @@ from ..publishing.models import LearningPackage +class MediaType(models.Model): + """ + Stores Media types for use by RawContent models. + + This is the same as MIME types (the IANA renamed MIME Types to Media Types). + We don't pre-populate this table, so APIs that add RawContent must ensure + that the desired Media Type exists. + + Media types are written as {type}/{sub_type}+{suffix}, where suffixes are + seldom used. + + * application/json + * text/css + * image/svg+xml + * application/vnd.openedx.xblock.v1.problem+xml + + We have this as a separate model (instead of a field on RawContent) because: + + 1. We can save a lot on storage and indexing for RawContent if we're just + storing foreign key references there, rather than the entire content + string to be indexed. This is especially relevant for our (long) custom + types like "application/vnd.openedx.xblock.v1.problem+xml". + 2. These values can occasionally change. For instance, "text/javascript" vs. + "application/javascript". Also, we will be using a fair number of "vnd." + style of custom content types, and we may want the flexibility of + changing that without having to worry about migrating millions of rows of + RawContent. + """ + # We're going to have many foreign key references from RawContent into this + # model, and we don't need to store those as 8-byte BigAutoField, as is the + # default for this app. It's likely that a SmallAutoField would work, but I + # can just barely imagine using more than 32K Media types if we have a bunch + # of custom "vnd." entries, or start tracking suffixes and parameters. Which + # is how we end up at the 4-byte AutoField. + id = models.AutoField(primary_key=True) + + # Media types are denoted as {type}/{sub_type}+{suffix}. We currently do not + # support parameters. + + # Media type, e.g. "application", "text", "image". Per RFC 4288, this can be + # at most 127 chars long and is case insensitive. In practice, it's almost + # always written in lowercase. + type = case_insensitive_char_field(max_length=127, blank=False, null=False) + + # Media sub-type, e.g. "json", "css", "png". Per RFC 4288, this can be at + # most 127 chars long and is case insensitive. In practice, it's almost + # always written in lowercase. + sub_type = case_insensitive_char_field(max_length=127, blank=False, null=False) + + # Suffix, usually just "xml" (e.g. "image/svg+xml"). Usually blank. I + # couldn't find an RFC description of the length limit, and 127 is probably + # excessive. But this table should be small enough where it doesn't really + # matter. + suffix = case_insensitive_char_field(max_length=127, blank=True, null=False) + + class Meta: + constraints = [ + # Make sure all (type + sub_type + suffix) combinations are unique. + models.UniqueConstraint( + fields=[ + "type", + "sub_type", + "suffix", + ], + name="oel_contents_uniq_t_st_sfx", + ), + ] + + def __str__(self): + base = f"{self.type}/{self.sub_type}" + if self.suffix: + return f"{base}+{self.suffix}" + return base + + class RawContent(models.Model): # type: ignore[django-manager-missing] """ This is the most basic piece of raw content data, with no version metadata. @@ -77,15 +154,8 @@ class RawContent(models.Model): # type: ignore[django-manager-missing] # openedx.lib.fields module. hash_digest = hash_field() - # MIME type, such as "text/html", "image/png", etc. Per RFC 4288, MIME type - # and sub-type may each be 127 chars, making a max of 255 (including the "/" - # in between). Also, while MIME types are almost always written in lowercase - # as a matter of convention, by spec they are NOT case sensitive. - # - # DO NOT STORE parameters here, e.g. "charset=". We can make a new field if - # that becomes necessary. If we do decide to store parameters and values - # later, note that those *may be* case sensitive. - mime_type = case_insensitive_char_field(max_length=255, blank=False) + # What is the Media type (a.k.a. MIME type) of this data? + media_type = models.ForeignKey(MediaType, on_delete=models.PROTECT) # This is the size of the raw data file in bytes. This can be different than # the character length, since UTF-8 encoding can use anywhere between 1-4 @@ -107,6 +177,10 @@ class RawContent(models.Model): # type: ignore[django-manager-missing] storage=settings.OPENEDX_LEARNING.get("STORAGE", default_storage), # type: ignore ) + @cached_property + def mime_type(self): + return str(self.media_type) + class Meta: constraints = [ # Make sure we don't store duplicates of this raw data within the @@ -114,21 +188,21 @@ class Meta: models.UniqueConstraint( fields=[ "learning_package", - "mime_type", + "media_type", "hash_digest", ], - name="oel_content_uniq_lc_mime_type_hash_digest", + name="oel_content_uniq_lc_media_type_hash_digest", ), ] indexes = [ - # LearningPackage MIME type Index: + # LearningPackage Media type Index: # * Break down Content counts by type/subtype with in a # LearningPackage. # * Find all the Content in a LearningPackage that matches a # certain MIME type (e.g. "image/png", "application/pdf". models.Index( - fields=["learning_package", "mime_type"], - name="oel_content_idx_lp_mime_type", + fields=["learning_package", "media_type"], + name="oel_content_idx_lp_media_type", ), # LearningPackage (reverse) Size Index: # * Find largest Content in a LearningPackage. diff --git a/openedx_learning/lib/cache.py b/openedx_learning/lib/cache.py new file mode 100644 index 00000000..098dbab0 --- /dev/null +++ b/openedx_learning/lib/cache.py @@ -0,0 +1,34 @@ +""" +Test-friendly helpers for caching. + +LRU caching can be especially helpful for Learning Core data models because so +much of it is immutable, e.g. Media Type lookups. But while these data models +are immutable within a given site, we also want to be able to track and clear +these caches across test runs. Later on, we may also want to inspect them to +make sure they are not growing overly large. +""" +import functools + +# List of functions that have our +_lru_cached_fns = [] + + +def lru_cache(*args, **kwargs): + """ + Thin wrapper over functools.lru_cache that lets us clear all caches later. + """ + def decorator(fn): + wrapped_fn = functools.lru_cache(*args, **kwargs)(fn) + _lru_cached_fns.append(wrapped_fn) + return wrapped_fn + return decorator + + +def clear_lru_caches(): + """ + Clear all LRU caches that use our lru_cache decorator. + + Useful for tests. + """ + for fn in _lru_cached_fns: + fn.cache_clear() diff --git a/openedx_learning/lib/fields.py b/openedx_learning/lib/fields.py index b239eb8c..900e590f 100644 --- a/openedx_learning/lib/fields.py +++ b/openedx_learning/lib/fields.py @@ -121,6 +121,16 @@ def hash_field() -> models.CharField: Use the create_hash_digest function to generate data suitable for this field. + + There are a couple of ways that we could have stored this more efficiently, + but we don't at this time: + + 1. A BinaryField would be the most space efficient, but Django doesn't + support indexing a BinaryField in a MySQL database. + 2. We could make the field case-sensitive and run it through a URL-safe + base64 encoding. But the amount of space this saves vs. the complexity + didn't seem worthwhile, particularly the possibility of case-sensitivity + related bugs. """ return models.CharField( max_length=40, diff --git a/openedx_learning/lib/test_utils.py b/openedx_learning/lib/test_utils.py new file mode 100644 index 00000000..5c947311 --- /dev/null +++ b/openedx_learning/lib/test_utils.py @@ -0,0 +1,22 @@ +""" +Test utilities for Learning Core. + +The only thing here now is a TestCase class that knows how to clean up the +caching used by the cache module in this package. +""" +import django.test + +from .cache import clear_lru_caches + + +class TestCase(django.test.TestCase): + """ + Subclass of Django's TestCase that knows how to reset caching we might use. + """ + def setUp(self) -> None: + clear_lru_caches() + super().setUp() + + def tearDown(self) -> None: + clear_lru_caches() + super().tearDown() diff --git a/openedx_tagging/core/tagging/import_export/api.py b/openedx_tagging/core/tagging/import_export/api.py index bdc0992a..7d6ae5e7 100644 --- a/openedx_tagging/core/tagging/import_export/api.py +++ b/openedx_tagging/core/tagging/import_export/api.py @@ -115,7 +115,7 @@ def import_tags( tag_import_plan.execute(task) task.end_success() return True - except Exception as exception: # pylint: disable=broad-exception-caught + except Exception as exception: # Log any exception task.log_exception(exception) return False diff --git a/tests/openedx_learning/core/components/test_models.py b/tests/openedx_learning/core/components/test_models.py index 54af3de1..7675f705 100644 --- a/tests/openedx_learning/core/components/test_models.py +++ b/tests/openedx_learning/core/components/test_models.py @@ -3,10 +3,9 @@ """ from datetime import datetime, timezone -from django.test.testcases import TestCase - from openedx_learning.core.components.api import create_component_and_version from openedx_learning.core.publishing.api import LearningPackage, create_learning_package, publish_all_drafts +from openedx_learning.lib.test_utils import TestCase class TestModelVersioningQueries(TestCase): diff --git a/tests/openedx_learning/core/contents/__init__.py b/tests/openedx_learning/core/contents/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/openedx_learning/core/contents/test_media_types.py b/tests/openedx_learning/core/contents/test_media_types.py new file mode 100644 index 00000000..c880a008 --- /dev/null +++ b/tests/openedx_learning/core/contents/test_media_types.py @@ -0,0 +1,34 @@ +""" +A few tests to make sure our MediaType lookups are working as expected. +""" +from openedx_learning.core.contents import api as contents_api +from openedx_learning.lib.test_utils import TestCase + + +class MediaTypeCachingTestCase(TestCase): + """ + Test that our LRU caching is working as expected. + """ + def test_media_query_caching(self): + """Test MediaType queries auto-create and caching.""" + assert contents_api.get_media_type_id.cache_info().currsize == 0 + + mime_type_str = "application/vnd.openedx.xblock.v1.problem+xml" + media_type_id = contents_api.get_media_type_id(mime_type_str) + + # Now it should be loaded in the cache + assert contents_api.get_media_type_id.cache_info().currsize == 1 + + # Second call pulls from cache instead of the database + with self.assertNumQueries(0): + # Should also return the same thing it did last time. + assert media_type_id == contents_api.get_media_type_id(mime_type_str) + + def test_media_query_caching_reset(self): + """ + Test that setUp/tearDown reset the get_media_type_id LRU cache. + + This test method's *must* execute after test_media_query_caching to be + meaningful (they execute in string sort order). + """ + assert contents_api.get_media_type_id.cache_info().currsize == 0 diff --git a/tests/openedx_learning/core/publishing/test_api.py b/tests/openedx_learning/core/publishing/test_api.py index 9327a100..a258f5b0 100644 --- a/tests/openedx_learning/core/publishing/test_api.py +++ b/tests/openedx_learning/core/publishing/test_api.py @@ -6,9 +6,9 @@ import pytest from django.core.exceptions import ValidationError -from django.test import TestCase from openedx_learning.core.publishing import api as publishing_api +from openedx_learning.lib.test_utils import TestCase class CreateLearningPackageTestCase(TestCase):