Skip to content

Commit

Permalink
Merge pull request #610 from NatLibFi/issue603-shared-vocabs
Browse files Browse the repository at this point in the history
Store vocabs in AnnifRegistry so they are shared between projects
  • Loading branch information
osma authored Aug 18, 2022
2 parents 3fd2202 + e48a2fc commit c291930
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 31 deletions.
6 changes: 2 additions & 4 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import annif.corpus
import annif.suggestion
import annif.backend
import annif.vocab
from annif.datadir import DatadirMixin
from annif.exception import AnnifException, ConfigurationException, \
NotSupportedException, NotInitializedException
Expand Down Expand Up @@ -155,9 +154,8 @@ def vocab(self):
if self.vocab_spec is None:
raise ConfigurationException("vocab setting is missing",
project_id=self.project_id)
self._vocab = annif.vocab.get_vocab(self.vocab_spec,
self._base_datadir,
self.language)
self._vocab = self.registry.get_vocab(self.vocab_spec,
self.language)

return self._vocab

Expand Down
47 changes: 35 additions & 12 deletions annif/registry.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,41 @@
"""Registry that keeps track of Annif projects"""

import collections
import re
from flask import current_app
import annif
from annif.config import parse_config
from annif.project import Access, AnnifProject
from annif.vocab import AnnifVocabulary
from annif.util import parse_args

logger = annif.logger


class AnnifRegistry:
"""Class that keeps track of the Annif projects"""

# Note: The individual projects are stored in a shared static variable,
# keyed by the "registry ID" which is unique to the registry instance.
# This is done to make it possible to serialize AnnifRegistry instances
# without including the potentially huge project objects (which contain
# backends with large models, vocabularies with lots of concepts etc).
# Serialized AnnifRegistry instances can then be passed between
# processes when using the multiprocessing module.
"""Class that keeps track of the Annif projects and vocabularies"""

# Note: The individual projects and vocabularies are stored in shared
# static variables, keyed by the "registry ID" which is unique to the
# registry instance. This is done to make it possible to serialize
# AnnifRegistry instances without including the potentially huge objects
# (which contain backends with large models, vocabularies with lots of
# concepts etc). Serialized AnnifRegistry instances can then be passed
# between processes when using the multiprocessing module.
_projects = {}
_vocabs = {}

def __init__(self, projects_config_path, datadir, init_projects):
self._rid = id(self)
self._datadir = datadir
self._projects[self._rid] = \
self._create_projects(projects_config_path, datadir)
self._create_projects(projects_config_path)
self._vocabs[self._rid] = {}
if init_projects:
for project in self._projects[self._rid].values():
project.initialize()

def _create_projects(self, projects_config_path, datadir):
def _create_projects(self, projects_config_path):
# parse the configuration
config = parse_config(projects_config_path)

Expand All @@ -42,7 +48,7 @@ def _create_projects(self, projects_config_path, datadir):
for project_id in config.project_ids:
projects[project_id] = AnnifProject(project_id,
config[project_id],
datadir,
self._datadir,
self)
return projects

Expand All @@ -64,6 +70,23 @@ def get_project(self, project_id, min_access=Access.private):
except KeyError:
raise ValueError("No such project {}".format(project_id))

def get_vocab(self, vocab_spec, default_language):
    """Return the AnnifVocabulary for vocab_spec, creating it on first use.

    The specification has the form ``vocab_id`` or ``vocab_id(args)``.
    When arguments are given, the first positional argument is used as
    the vocabulary language; otherwise default_language is used.

    Vocabulary objects are cached per registry instance, keyed by
    (vocab_id, language), so projects asking for the same vocabulary
    and language receive the same object.

    Raises ValueError if vocab_spec is not a valid specification."""
    # fullmatch (rather than match) so that trailing garbage such as an
    # unclosed parenthesis or an unsupported character is rejected instead
    # of being silently dropped, which would select the wrong vocabulary.
    match = re.fullmatch(r'(\w+)(\((.*)\))?', vocab_spec)
    if match is None:
        raise ValueError(
            f"Invalid vocabulary specification: {vocab_spec}")
    vocab_id = match.group(1)
    # group(3) is None when the spec has no parenthesised argument list;
    # keyword arguments are currently not used here.
    posargs, _ = parse_args(match.group(3))
    language = posargs[0] if posargs else default_language
    vocab_key = (vocab_id, language)

    vocabs = self._vocabs[self._rid]
    if vocab_key not in vocabs:
        vocabs[vocab_key] = AnnifVocabulary(
            vocab_id, self._datadir, language)
    return vocabs[vocab_key]


def initialize_projects(app):
projects_config_path = app.config['PROJECTS_CONFIG_PATH']
Expand Down
13 changes: 0 additions & 13 deletions annif/vocab.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,15 @@
"""Vocabulary management functionality for Annif"""

import os.path
import re
import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException
from annif.util import parse_args

logger = annif.logger


def get_vocab(vocab_spec, datadir, default_language):
    """Create an AnnifVocabulary from a vocabulary specification.

    The specification has the form ``vocab_id`` or ``vocab_id(args)``.
    When arguments are given, the first positional argument is used as
    the vocabulary language; otherwise default_language is used. The
    datadir is passed through to the AnnifVocabulary constructor.

    Raises ValueError if vocab_spec is not a valid specification."""
    # fullmatch (rather than match) so that trailing garbage such as an
    # unclosed parenthesis or an unsupported character is rejected instead
    # of being silently dropped, which would select the wrong vocabulary.
    match = re.fullmatch(r'(\w+)(\((.*)\))?', vocab_spec)
    if match is None:
        raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")
    vocab_id = match.group(1)
    # group(3) is None when the spec has no parenthesised argument list;
    # keyword arguments are currently not used here.
    posargs, _ = parse_args(match.group(3))
    language = posargs[0] if posargs else default_language

    return AnnifVocabulary(vocab_id, datadir, language)


class AnnifVocabulary(DatadirMixin):
"""Class representing a subject vocabulary which can be used by multiple
Annif projects."""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def load_dummy_vocab(tmpdir):
return vocab


def test_get_vocab_invalid():
def test_get_vocab_invalid(registry):
with pytest.raises(ValueError) as excinfo:
annif.vocab.get_vocab('', None, None)
registry.get_vocab('', None)
assert 'Invalid vocabulary specification' in str(excinfo.value)


Expand Down

0 comments on commit c291930

Please sign in to comment.