Feature improve plugginability (#7)

* Models now generated from entrypoints. Language codes are more customizable using a dict map. * Updated changelog for v0.2.1 * Improved doc for plugins * Trim unused code * Bump version number * Added python3.11 to classifiers * Fixed link
Crivella · Sep 20, 2023 · 2d19366 · 2d19366
1 parent b28a143
commit 2d19366
Show file tree

Hide file tree

Showing 12 changed files with 327 additions and 125 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,24 @@
 
 List of changes between versions
 
+## 0.2.1
+
+Plugins can now be used to also add models to the database via the following entrypoints:
+
+- `ocr_translate.box_data`
+- `ocr_translate.ocr_data`
+- `ocr_translate.tsl_data`
+
+The entrypoint should point to a `dict` with the info to create the model.
+See [init of plugins](ocr_translate/plugins/__init__.py) for an example (note that box/ocr/tsl may need to define different keys).
+
+Information about model-specific language codes is now encoded into an `iso1_map` field of the model.
+
+- Before, adding new models with custom codes in a plugin also required editing the main repo to add a new column to languages in the database.
+- Now the plugin can set `lang_code` to whatever is closest to the model codes, and override what does not match using `iso1_map`, which maps ISO 639-1 codes to the model-specific ones.
+
+Tagged only, without a release, as the changes still require plugins to be baked in with the installer (they cannot be dynamically added without a hack-ish solution).
+
 ## 0.2.0
 
 Restructured the code to make it pluginable.

diff --git a/README.md b/README.md
@@ -152,6 +152,8 @@ before installing the python package.
 
 ## Writing plugins for the server
 
+Since version 0.2.1, plugins can also be used to add models to the database via dedicated entrypoints. See [CHANGELOG](/CHANGELOG.md) for more details.
+
 Since version 0.2.0 the server has been made pluginable.
 You can write a plugin for a model/web-service that has not yet been implemented, by subclassing the following models
 

diff --git a/build.sh b/build.sh
@@ -7,7 +7,6 @@ pyinstaller \
     --name run_server \
     --icon icon.ico \
     --add-data "ocr_translate/ocr_tsl/languages.json:ocr_translate/ocr_tsl" \
-    --add-data "ocr_translate/ocr_tsl/models.json:ocr_translate/ocr_tsl" \
     --collect-all djang-ocr_translate \
     --collect-all torch \
     --collect-all torchvision \

diff --git a/ocr_translate/__init__.py b/ocr_translate/__init__.py
@@ -18,4 +18,4 @@
 ###################################################################################
 """OCR and translation of images."""
 
-__version__ = '0.2.0'
+__version__ = '0.2.1'
diff --git a/...translate/migrations/0009_remove_language_easyocr_remove_language_facebookm2m_and_more.py b/...translate/migrations/0009_remove_language_easyocr_remove_language_facebookm2m_and_more.py
@@ -0,0 +1,40 @@
+# Generated by Django 4.2.4 on 2023-09-20 11:37
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Schema change for per-model language code maps.
+
+    Removes the `easyocr`, `facebookM2M` and `tesseract` fields from `language`
+    and adds a nullable JSON `iso1_map` field to `ocrboxmodel`, `ocrmodel` and
+    `tslmodel`. Only field removals/additions are listed; no data is copied.
+    """
+
+    dependencies = [
+        ('ocr_translate', '0008_ocrboxmodel_entrypoint_and_more'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='language',
+            name='easyocr',
+        ),
+        migrations.RemoveField(
+            model_name='language',
+            name='facebookM2M',
+        ),
+        migrations.RemoveField(
+            model_name='language',
+            name='tesseract',
+        ),
+        migrations.AddField(
+            model_name='ocrboxmodel',
+            name='iso1_map',
+            field=models.JSONField(null=True),
+        ),
+        migrations.AddField(
+            model_name='ocrmodel',
+            name='iso1_map',
+            field=models.JSONField(null=True),
+        ),
+        migrations.AddField(
+            model_name='tslmodel',
+            name='iso1_map',
+            field=models.JSONField(null=True),
+        ),
+    ]
diff --git a/ocr_translate/models.py b/ocr_translate/models.py
@@ -19,17 +19,15 @@
 """Django models for the ocr_translate app."""
 import logging
 import re
+from importlib.metadata import entry_points
 from typing import Generator, Type, Union
 
-import pkg_resources
 from django.db import models
 from PIL.Image import Image as PILImage
 
 from . import queues
 from .messaging import Message
 
-LANG_LENGTH = 32
-
 logger = logging.getLogger('ocr.general')
 
 class OptionDict(models.Model):
@@ -47,10 +45,6 @@ class Language(models.Model):
     iso2t = models.CharField(max_length=8, unique=True)
     iso3 = models.CharField(max_length=32, unique=True)
 
-    easyocr = models.CharField(max_length=32, null=True)
-    tesseract = models.CharField(max_length=32, null=True)
-    facebookM2M = models.CharField(max_length=32, null=True)
-
     default_options = models.ForeignKey(
         OptionDict, on_delete=models.CASCADE, related_name='lang_default_options', null=True
         )
@@ -69,6 +63,9 @@ class Meta:
 
     entrypoint = models.CharField(max_length=128, null=True)
 
+    language_format = models.CharField(max_length=32, null=True)
+    iso1_map = models.JSONField(null=True)
+
     default_options = models.ForeignKey(
         OptionDict, on_delete=models.SET_NULL, related_name='used_by_%(class)s', null=True
         )
@@ -82,6 +79,12 @@ def __del__(self):
         except NotImplementedError:
             pass
 
+    def get_lang_code(self, lang: 'Language') -> str:
+        """Get the language code for a specific model.
+
+        Args:
+            lang: Language whose model-specific code should be resolved.
+
+        Returns:
+            The `iso1_map` entry keyed by `lang.iso1` when `iso1_map` is a dict
+            containing that key; otherwise the attribute of `lang` named by
+            `language_format` (`iso1` when `language_format` is unset or empty).
+        """
+        # Explicit per-model overrides take precedence over the generic code format.
+        if isinstance(self.iso1_map, dict) and lang.iso1 in self.iso1_map:
+            return self.iso1_map[lang.iso1]
+        return getattr(lang, self.language_format or 'iso1')
+
     @classmethod
     def from_entrypoint(cls, name: str) -> Type['models.Model']:
         """Get the entrypoint specific TSL model class from the entrypoint name"""
@@ -92,7 +95,7 @@ def from_entrypoint(cls, name: str) -> Type['models.Model']:
         ept = obj.entrypoint
 
         logger.debug(f'Loading model {name} from entrypoint {cls.entrypoint_namespace}:{ept}')
-        for entrypoint in pkg_resources.iter_entry_points(cls.entrypoint_namespace, name=ept):
+        for entrypoint in entry_points(group=cls.entrypoint_namespace, name=ept):
             new_cls = entrypoint.load()
             break
         else:
@@ -116,8 +119,6 @@ class OCRModel(BaseModel):
 
     languages = models.ManyToManyField(Language, related_name='ocr_models')
 
-    language_format = models.CharField(max_length=32, null=True)
-
     def prepare_image(
             self,
             img: PILImage, bbox: tuple[int, int, int, int] = None
@@ -181,7 +182,7 @@ def ocr(
             logger.info('Running OCR')
 
             id_ = (bbox_obj.id, self.id, lang.id)
-            mlang = getattr(lang, self.language_format or 'iso1')
+            mlang = self.get_lang_code(lang)
             opt_dct = options_obj.options
             text = queues.ocr_queue.put(
                 id_=id_,
@@ -222,8 +223,6 @@ class OCRBoxModel(BaseModel):
 
     languages = models.ManyToManyField(Language, related_name='box_models')
 
-    language_format = models.CharField(max_length=32, null=True)
-
     def _box_detection(
             self,
             image: PILImage, options: dict = None
@@ -302,8 +301,6 @@ class TSLModel(BaseModel):
     src_languages = models.ManyToManyField(Language, related_name='tsl_models_src')
     dst_languages = models.ManyToManyField(Language, related_name='tsl_models_dst')
 
-    language_format = models.CharField(max_length=32, null=True)
-
     @staticmethod
     def pre_tokenize(
             text: str,
@@ -406,8 +403,8 @@ def translate(
                 msg={
                     'args': (
                         tokens,
-                        getattr(src, self.language_format),
-                        getattr(dst, self.language_format)
+                        self.get_lang_code(src),
+                        self.get_lang_code(dst),
                         ),
                     'kwargs': {'options': opt_dct},
                 },

diff --git a/ocr_translate/ocr_tsl/initializers.py b/ocr_translate/ocr_tsl/initializers.py
@@ -19,6 +19,7 @@
 """Initialize the server based on environment variables."""
 import json
 import logging
+from importlib.metadata import entry_points
 from pathlib import Path
 
 from django.db.models import Count
@@ -69,67 +70,93 @@ def auto_create_languages():
         opt_obj, _ = m.OptionDict.objects.get_or_create(options=def_opt)
         l, _ = m.Language.objects.get_or_create(name=name, iso1=iso1, iso2t=iso2t, iso2b=iso2b, iso3=iso3)
         l.default_options = opt_obj
-        for k,v in lang.items():
-            setattr(l, k, v)
+        # for k,v in lang.items():
+        #     setattr(l, k, v)
         l.save()
 
-def auto_create_models():
-    """Create OCR and TSL models from json file. Also create default OptionDict"""
-    cwd = Path(__file__).parent
-    with open(cwd / 'models.json', encoding='utf-8') as f:
-        models = json.load(f)
+def load_ept_data(namespace):
+    """Load all entrypoints from a namespace into a list.
+
+    Args:
+        namespace: Entry point group name, passed to `entry_points(group=...)`.
+
+    Returns:
+        A list holding a `.copy()` of the object loaded from every entrypoint
+        in the group.
+    """
+    # NOTE(review): keyword selection in `entry_points()` needs Python >= 3.10 - confirm the minimum supported version.
+    res = []
+
+    for ept in entry_points(group=namespace):
+        # Copy so that callers can pop keys without altering the loaded object
+        res.append(ept.load().copy())
 
-    for box in models['box']:
+    return res
+
+def auto_create_box():
+    """Create OCRBoxModel objects from entrypoints.
+
+    Every dict loaded from the `ocr_translate.box_data` group must provide the
+    keys `lang`, `lang_code` and `entrypoint`; `iso1_map` and `default_options`
+    are optional and default to `{}`. The remaining keys are forwarded to
+    `OCRBoxModel.objects.get_or_create`. Entries of `lang` are looked up through
+    the `iso1` field of `Language`.
+    """
+    for box in load_ept_data('ocr_translate.box_data'):
         logger.debug(f'Creating box model: {box}')
         lang = box.pop('lang')
         lcode = box.pop('lang_code')
         entrypoint = box.pop('entrypoint')
+        iso1_map = box.pop('iso1_map', {})
         def_opt = box.pop('default_options', {})
         opt_obj, _ = m.OptionDict.objects.get_or_create(options=def_opt)
         model, _ = m.OCRBoxModel.objects.get_or_create(**box)
         model.default_options = opt_obj
         model.entrypoint = entrypoint
         model.language_format = lcode
+        model.iso1_map = iso1_map
+        # Reset the relation so it ends up holding only the languages listed in `lang`
+        model.languages.clear()
         for l in lang:
             model.languages.add(m.Language.objects.get(iso1=l))
         model.save()
 
-    for ocr in models['ocr']:
+def auto_create_ocr():
+    """Create OCRModel objects from entrypoints.
+
+    Every dict loaded from the `ocr_translate.ocr_data` group must provide the
+    keys `lang`, `lang_code` and `entrypoint`; `iso1_map` and `default_options`
+    are optional and default to `{}`. The remaining keys are forwarded to
+    `OCRModel.objects.get_or_create`. Entries of `lang` are looked up through
+    the `iso1` field of `Language`.
+    """
+    for ocr in load_ept_data('ocr_translate.ocr_data'):
         logger.debug(f'Creating ocr model: {ocr}')
         lang = ocr.pop('lang')
         lcode = ocr.pop('lang_code')
         entrypoint = ocr.pop('entrypoint')
+        iso1_map = ocr.pop('iso1_map', {})
         def_opt = ocr.pop('default_options', {})
         opt_obj, _ = m.OptionDict.objects.get_or_create(options=def_opt)
         model, _ = m.OCRModel.objects.get_or_create(**ocr)
         model.default_options = opt_obj
         model.language_format = lcode
+        model.iso1_map = iso1_map
         model.entrypoint = entrypoint
+        # Reset the relation so it ends up holding only the languages listed in `lang`
+        model.languages.clear()
         for l in lang:
             model.languages.add(m.Language.objects.get(iso1=l))
         model.save()
 
-    for tsl in models['tsl']:
+def auto_create_tsl():
+    """Create TSLModel objects from entrypoints.
+
+    Every dict loaded from the `ocr_translate.tsl_data` group must provide the
+    keys `lang_src`, `lang_dst` and `entrypoint`; `lang_code`, `iso1_map` and
+    `default_options` are optional. The remaining keys are forwarded to
+    `TSLModel.objects.get_or_create`. Source/destination languages are matched
+    by filtering `Language` on the field named by `lang_code` (`iso1` when
+    `lang_code` is missing or empty).
+    """
+    for tsl in load_ept_data('ocr_translate.tsl_data'):
         logger.debug(f'Creating tsl model: {tsl}')
         src = tsl.pop('lang_src')
         dst = tsl.pop('lang_dst')
         lcode = tsl.pop('lang_code', None)
+        # `lang_code` is optional, but a `None` keyword name in `filter(**kwargs)`
+        # raises TypeError: fall back to `iso1`, the default used by `get_lang_code`.
+        lookup_field = lcode or 'iso1'
         entrypoint = tsl.pop('entrypoint')
+        iso1_map = tsl.pop('iso1_map', {})
         def_opt = tsl.pop('default_options', {})
         opt_obj, _ = m.OptionDict.objects.get_or_create(options=def_opt)
         model, _ = m.TSLModel.objects.get_or_create(**tsl)
         model.default_options = opt_obj
         model.language_format = lcode
+        model.iso1_map = iso1_map
         model.entrypoint = entrypoint
+        # Reset the relation so it ends up holding only the languages listed in `lang_src`
+        model.src_languages.clear()
         for l in src:
             logger.debug(f'Adding src language: {l}')
-            kwargs = {lcode: l}
+            kwargs = {lookup_field: l}
             model.src_languages.add(*m.Language.objects.filter(**kwargs))
 
+        # Same reset for the destination side
+        model.dst_languages.clear()
         for l in dst:
             logger.debug(f'Adding dst language: {l}')
-            kwargs = {lcode: l}
+            kwargs = {lookup_field: l}
             model.dst_languages.add(*m.Language.objects.filter(**kwargs))
         model.save()
 
+def auto_create_models():
+    """Create box, OCR and TSL models from plugin entrypoints. Also ensure an OptionDict with empty options exists."""
+    auto_create_box()
+    auto_create_ocr()
+    auto_create_tsl()
+
     m.OptionDict.objects.get_or_create(options={})
diff --git a/ocr_translate/ocr_tsl/models.json b/ocr_translate/ocr_tsl/models.json