Skip to content

Commit

Permalink
Feature improve plugginability (#7)
Browse files Browse the repository at this point in the history
* Models now generated from entrypoints. Language codes are more customizable using a dict map.

* Updated changelog for v0.2.1

* Improved doc for plugins

* Trim unused code

* Bump version number

* Added python3.11 to classifiers

* Fixed link
  • Loading branch information
Crivella authored Sep 20, 2023
1 parent b28a143 commit 2d19366
Show file tree
Hide file tree
Showing 12 changed files with 327 additions and 125 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@

List of changes between versions

## 0.2.1

Plugins can now be used to also add models to the database via the following entrypoints:

- `ocr_translate.box_data`
- `ocr_translate.ocr_data`
- `ocr_translate.tsl_data`

The entrypoint should point to a `dict` with the info to create the model.
See the [plugins `__init__`](ocr_translate/plugins/__init__.py) for an example (note that box/ocr/tsl models may need to define different keys).

Information about model-specific language codes is now encoded into an `iso1_map` field of the model.

- Before, adding a new model with custom codes in a plugin also required editing the main repo to add a new column to languages in the database.
- Now the plugin can set the `lang_code` to whatever is closest to the model codes, and overwrite what does not match using `iso1_map`, by mapping iso-639-1 codes to the model-specific ones.

Tagged only, without a release, as the changes still require plugins to be baked in with the installer (they cannot be added dynamically without a hack-ish solution).

## 0.2.0

Restructured the code to make it pluginable.
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ before installing the python package.

## Writing plugins for the server

Since version 0.2.1, plugins can also be used to add models to the database via the `ocr_translate.box_data`, `ocr_translate.ocr_data` and `ocr_translate.tsl_data` entrypoints. See [CHANGELOG](/CHANGELOG.md) for more details.

Since version 0.2.0 the server has been made pluginable.
You can write a plugin for a model/web-service that has not yet been implemented, by subclassing the following models

Expand Down
1 change: 0 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ pyinstaller \
--name run_server \
--icon icon.ico \
--add-data "ocr_translate/ocr_tsl/languages.json:ocr_translate/ocr_tsl" \
--add-data "ocr_translate/ocr_tsl/models.json:ocr_translate/ocr_tsl" \
--collect-all djang-ocr_translate \
--collect-all torch \
--collect-all torchvision \
Expand Down
2 changes: 1 addition & 1 deletion ocr_translate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
###################################################################################
"""OCR and translation of images."""

__version__ = '0.2.0'
__version__ = '0.2.1'
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Generated by Django 4.2.4 on 2023-09-20 11:37

from django.db import migrations, models


class Migration(migrations.Migration):
    """Replace per-backend language-code columns with a per-model ``iso1_map``.

    Removes the ``easyocr``, ``facebookM2M`` and ``tesseract`` fields from the
    ``language`` model and adds a nullable JSON ``iso1_map`` field to the
    ``ocrboxmodel``, ``ocrmodel`` and ``tslmodel`` models.
    """

    # This migration builds on migration 0008 of the same app.
    dependencies = [
        ('ocr_translate', '0008_ocrboxmodel_entrypoint_and_more'),
    ]

    operations = [
        # NOTE(review): the three RemoveField operations drop the columns and
        # this migration contains no step that copies their values elsewhere.
        migrations.RemoveField(
            model_name='language',
            name='easyocr',
        ),
        migrations.RemoveField(
            model_name='language',
            name='facebookM2M',
        ),
        migrations.RemoveField(
            model_name='language',
            name='tesseract',
        ),
        # `null=True` lets rows that already exist keep an empty `iso1_map`.
        migrations.AddField(
            model_name='ocrboxmodel',
            name='iso1_map',
            field=models.JSONField(null=True),
        ),
        migrations.AddField(
            model_name='ocrmodel',
            name='iso1_map',
            field=models.JSONField(null=True),
        ),
        migrations.AddField(
            model_name='tslmodel',
            name='iso1_map',
            field=models.JSONField(null=True),
        ),
    ]
31 changes: 14 additions & 17 deletions ocr_translate/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,15 @@
"""Django models for the ocr_translate app."""
import logging
import re
from importlib.metadata import entry_points
from typing import Generator, Type, Union

import pkg_resources
from django.db import models
from PIL.Image import Image as PILImage

from . import queues
from .messaging import Message

LANG_LENGTH = 32

logger = logging.getLogger('ocr.general')

class OptionDict(models.Model):
Expand All @@ -47,10 +45,6 @@ class Language(models.Model):
iso2t = models.CharField(max_length=8, unique=True)
iso3 = models.CharField(max_length=32, unique=True)

easyocr = models.CharField(max_length=32, null=True)
tesseract = models.CharField(max_length=32, null=True)
facebookM2M = models.CharField(max_length=32, null=True)

default_options = models.ForeignKey(
OptionDict, on_delete=models.CASCADE, related_name='lang_default_options', null=True
)
Expand All @@ -69,6 +63,9 @@ class Meta:

entrypoint = models.CharField(max_length=128, null=True)

language_format = models.CharField(max_length=32, null=True)
iso1_map = models.JSONField(null=True)

default_options = models.ForeignKey(
OptionDict, on_delete=models.SET_NULL, related_name='used_by_%(class)s', null=True
)
Expand All @@ -82,6 +79,12 @@ def __del__(self):
except NotImplementedError:
pass

def get_lang_code(self, lang: 'Language') -> str:
    """Return the code this model uses to refer to `lang`.

    An entry for ``lang.iso1`` in ``self.iso1_map`` (when that field holds a
    dict) takes precedence. Otherwise the code is read from the attribute of
    `lang` named by ``self.language_format``, defaulting to ``'iso1'`` when no
    format is set.
    """
    overrides = self.iso1_map
    if isinstance(overrides, dict):
        try:
            return overrides[lang.iso1]
        except KeyError:
            # No model-specific override for this language.
            pass

    attr_name = self.language_format if self.language_format else 'iso1'
    return getattr(lang, attr_name)

@classmethod
def from_entrypoint(cls, name: str) -> Type['models.Model']:
"""Get the entrypoint specific TSL model class from the entrypoint name"""
Expand All @@ -92,7 +95,7 @@ def from_entrypoint(cls, name: str) -> Type['models.Model']:
ept = obj.entrypoint

logger.debug(f'Loading model {name} from entrypoint {cls.entrypoint_namespace}:{ept}')
for entrypoint in pkg_resources.iter_entry_points(cls.entrypoint_namespace, name=ept):
for entrypoint in entry_points(group=cls.entrypoint_namespace, name=ept):
new_cls = entrypoint.load()
break
else:
Expand All @@ -116,8 +119,6 @@ class OCRModel(BaseModel):

languages = models.ManyToManyField(Language, related_name='ocr_models')

language_format = models.CharField(max_length=32, null=True)

def prepare_image(
self,
img: PILImage, bbox: tuple[int, int, int, int] = None
Expand Down Expand Up @@ -181,7 +182,7 @@ def ocr(
logger.info('Running OCR')

id_ = (bbox_obj.id, self.id, lang.id)
mlang = getattr(lang, self.language_format or 'iso1')
mlang = self.get_lang_code(lang)
opt_dct = options_obj.options
text = queues.ocr_queue.put(
id_=id_,
Expand Down Expand Up @@ -222,8 +223,6 @@ class OCRBoxModel(BaseModel):

languages = models.ManyToManyField(Language, related_name='box_models')

language_format = models.CharField(max_length=32, null=True)

def _box_detection(
self,
image: PILImage, options: dict = None
Expand Down Expand Up @@ -302,8 +301,6 @@ class TSLModel(BaseModel):
src_languages = models.ManyToManyField(Language, related_name='tsl_models_src')
dst_languages = models.ManyToManyField(Language, related_name='tsl_models_dst')

language_format = models.CharField(max_length=32, null=True)

@staticmethod
def pre_tokenize(
text: str,
Expand Down Expand Up @@ -406,8 +403,8 @@ def translate(
msg={
'args': (
tokens,
getattr(src, self.language_format),
getattr(dst, self.language_format)
self.get_lang_code(src),
self.get_lang_code(dst),
),
'kwargs': {'options': opt_dct},
},
Expand Down
47 changes: 37 additions & 10 deletions ocr_translate/ocr_tsl/initializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"""Initialize the server based on environment variables."""
import json
import logging
from importlib.metadata import entry_points
from pathlib import Path

from django.db.models import Count
Expand Down Expand Up @@ -69,67 +70,93 @@ def auto_create_languages():
opt_obj, _ = m.OptionDict.objects.get_or_create(options=def_opt)
l, _ = m.Language.objects.get_or_create(name=name, iso1=iso1, iso2t=iso2t, iso2b=iso2b, iso3=iso3)
l.default_options = opt_obj
for k,v in lang.items():
setattr(l, k, v)
# for k,v in lang.items():
# setattr(l, k, v)
l.save()

def auto_create_models():
"""Create OCR and TSL models from json file. Also create default OptionDict"""
cwd = Path(__file__).parent
with open(cwd / 'models.json', encoding='utf-8') as f:
models = json.load(f)
def load_ept_data(namespace):
"""Load all entrypoints from a namespace into a list"""
res = []

for ept in entry_points(group=namespace):
# Copy required for pop on dct
res.append(ept.load().copy())

for box in models['box']:
return res

def auto_create_box():
    """Create (or refresh) an OCRBoxModel for each ``ocr_translate.box_data`` entrypoint.

    Every entrypoint dict must provide ``lang``, ``lang_code`` and
    ``entrypoint``; ``iso1_map`` and ``default_options`` are optional and
    default to ``{}``. The keys left over once those are removed form the
    ``get_or_create`` lookup for the model row.
    """
    for data in load_ept_data('ocr_translate.box_data'):
        logger.debug(f'Creating box model: {data}')

        # Strip the keys that are not plain lookup fields of the model.
        iso1_codes = data.pop('lang')
        code_format = data.pop('lang_code')
        ept_name = data.pop('entrypoint')
        code_overrides = data.pop('iso1_map', {})
        default_opts = data.pop('default_options', {})

        options, _ = m.OptionDict.objects.get_or_create(options=default_opts)
        box_model, _ = m.OCRBoxModel.objects.get_or_create(**data)

        box_model.entrypoint = ept_name
        box_model.language_format = code_format
        box_model.iso1_map = code_overrides
        box_model.default_options = options

        # Rebuild the language relation from scratch; `lang` entries are
        # matched against the `iso1` field of Language.
        box_model.languages.clear()
        for iso1_code in iso1_codes:
            language = m.Language.objects.get(iso1=iso1_code)
            box_model.languages.add(language)
        box_model.save()

for ocr in models['ocr']:
def auto_create_ocr():
    """Create (or refresh) an OCRModel for each ``ocr_translate.ocr_data`` entrypoint.

    Every entrypoint dict must provide ``lang``, ``lang_code`` and
    ``entrypoint``; ``iso1_map`` and ``default_options`` are optional and
    default to ``{}``. The keys left over once those are removed form the
    ``get_or_create`` lookup for the model row.
    """
    for data in load_ept_data('ocr_translate.ocr_data'):
        logger.debug(f'Creating ocr model: {data}')

        # Strip the keys that are not plain lookup fields of the model.
        iso1_codes = data.pop('lang')
        code_format = data.pop('lang_code')
        ept_name = data.pop('entrypoint')
        code_overrides = data.pop('iso1_map', {})
        default_opts = data.pop('default_options', {})

        options, _ = m.OptionDict.objects.get_or_create(options=default_opts)
        ocr_model, _ = m.OCRModel.objects.get_or_create(**data)

        ocr_model.entrypoint = ept_name
        ocr_model.language_format = code_format
        ocr_model.iso1_map = code_overrides
        ocr_model.default_options = options

        # Rebuild the language relation from scratch; `lang` entries are
        # matched against the `iso1` field of Language.
        ocr_model.languages.clear()
        for iso1_code in iso1_codes:
            language = m.Language.objects.get(iso1=iso1_code)
            ocr_model.languages.add(language)
        ocr_model.save()

for tsl in models['tsl']:
def auto_create_tsl():
    """Create (or refresh) a TSLModel for each ``ocr_translate.tsl_data`` entrypoint.

    Every entrypoint dict must provide ``lang_src``, ``lang_dst`` and
    ``entrypoint``. ``lang_code`` is optional (stored as ``None`` when absent),
    and ``iso1_map`` and ``default_options`` are optional and default to
    ``{}``. The keys left over once those are removed form the
    ``get_or_create`` lookup for the model row.

    The entries of ``lang_src``/``lang_dst`` are matched against the Language
    field named by ``lang_code``; entries matching no Language are skipped.

    Raises:
        KeyError: If a required key is missing from an entrypoint dict.
    """
    for tsl in load_ept_data('ocr_translate.tsl_data'):
        logger.debug(f'Creating tsl model: {tsl}')
        src = tsl.pop('lang_src')
        dst = tsl.pop('lang_dst')
        lcode = tsl.pop('lang_code', None)
        entrypoint = tsl.pop('entrypoint')
        iso1_map = tsl.pop('iso1_map', {})
        def_opt = tsl.pop('default_options', {})
        opt_obj, _ = m.OptionDict.objects.get_or_create(options=def_opt)
        model, _ = m.TSLModel.objects.get_or_create(**tsl)
        model.default_options = opt_obj
        model.language_format = lcode
        model.iso1_map = iso1_map
        model.entrypoint = entrypoint

        # `lang_code` may be omitted, and a None keyword name would make
        # `filter(**{None: code})` raise TypeError. Fall back to `iso1`,
        # the same default the models use when `language_format` is unset.
        lookup_field = lcode or 'iso1'

        model.src_languages.clear()
        for code in src:
            logger.debug(f'Adding src language: {code}')
            model.src_languages.add(*m.Language.objects.filter(**{lookup_field: code}))

        model.dst_languages.clear()
        for code in dst:
            logger.debug(f'Adding dst language: {code}')
            model.dst_languages.add(*m.Language.objects.filter(**{lookup_field: code}))
        model.save()

def auto_create_models():
    """Create box, OCR and TSL models from plugin entrypoints.

    Runs the three per-kind creators in order (box, ocr, tsl) and then makes
    sure an OptionDict with empty options exists.
    """
    auto_create_box()
    auto_create_ocr()
    auto_create_tsl()

    # Default OptionDict: an entry with no options set.
    m.OptionDict.objects.get_or_create(options={})
86 changes: 0 additions & 86 deletions ocr_translate/ocr_tsl/models.json

This file was deleted.

Loading

0 comments on commit 2d19366

Please sign in to comment.