Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store vocabs in AnnifRegistry so they are shared between projects #610

Merged
merged 1 commit into from
Aug 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import annif.corpus
import annif.suggestion
import annif.backend
import annif.vocab
from annif.datadir import DatadirMixin
from annif.exception import AnnifException, ConfigurationException, \
NotSupportedException, NotInitializedException
Expand Down Expand Up @@ -155,9 +154,8 @@ def vocab(self):
if self.vocab_spec is None:
raise ConfigurationException("vocab setting is missing",
project_id=self.project_id)
self._vocab = annif.vocab.get_vocab(self.vocab_spec,
self._base_datadir,
self.language)
self._vocab = self.registry.get_vocab(self.vocab_spec,
self.language)

return self._vocab

Expand Down
47 changes: 35 additions & 12 deletions annif/registry.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,41 @@
"""Registry that keeps track of Annif projects"""

import collections
import re
from flask import current_app
import annif
from annif.config import parse_config
from annif.project import Access, AnnifProject
from annif.vocab import AnnifVocabulary
from annif.util import parse_args

logger = annif.logger


class AnnifRegistry:
"""Class that keeps track of the Annif projects"""

# Note: The individual projects are stored in a shared static variable,
# keyed by the "registry ID" which is unique to the registry instance.
# This is done to make it possible to serialize AnnifRegistry instances
# without including the potentially huge project objects (which contain
# backends with large models, vocabularies with lots of concepts etc).
# Serialized AnnifRegistry instances can then be passed between
# processes when using the multiprocessing module.
"""Class that keeps track of the Annif projects and vocabularies"""

# Note: The individual projects and vocabularies are stored in shared
# static variables, keyed by the "registry ID" which is unique to the
# registry instance. This is done to make it possible to serialize
# AnnifRegistry instances without including the potentially huge objects
# (which contain backends with large models, vocabularies with lots of
# concepts etc). Serialized AnnifRegistry instances can then be passed
# between processes when using the multiprocessing module.
_projects = {}
_vocabs = {}

def __init__(self, projects_config_path, datadir, init_projects):
self._rid = id(self)
self._datadir = datadir
self._projects[self._rid] = \
self._create_projects(projects_config_path, datadir)
self._create_projects(projects_config_path)
self._vocabs[self._rid] = {}
if init_projects:
for project in self._projects[self._rid].values():
project.initialize()

def _create_projects(self, projects_config_path, datadir):
def _create_projects(self, projects_config_path):
# parse the configuration
config = parse_config(projects_config_path)

Expand All @@ -42,7 +48,7 @@ def _create_projects(self, projects_config_path, datadir):
for project_id in config.project_ids:
projects[project_id] = AnnifProject(project_id,
config[project_id],
datadir,
self._datadir,
self)
return projects

Expand All @@ -64,6 +70,23 @@ def get_project(self, project_id, min_access=Access.private):
except KeyError:
raise ValueError("No such project {}".format(project_id))

def get_vocab(self, vocab_spec, default_language):
    """Return the shared AnnifVocabulary described by vocab_spec.

    The specification must start with a vocabulary ID made of word
    characters, optionally followed by arguments in parentheses. When the
    parsed arguments contain at least one positional value, the first one
    is used as the language; otherwise default_language is used.

    Vocabulary objects are cached per registry instance, keyed by
    (vocabulary ID, language), so repeated requests for the same
    combination return the same object instead of creating a new one.

    Raises ValueError if vocab_spec does not begin with a vocabulary ID.
    """
    parsed = re.match(r'(\w+)(\((.*)\))?', vocab_spec)
    if parsed is None:
        raise ValueError(
            f"Invalid vocabulary specification: {vocab_spec}")

    # groups: (vocabulary ID, whole "(...)" part, text inside the parens)
    vocab_id, _, arg_string = parsed.groups()
    posargs, _kwargs = parse_args(arg_string)
    if posargs:
        language = posargs[0]
    else:
        language = default_language

    # vocabularies already created by this registry instance
    cache = self._vocabs[self._rid]
    cache_key = (vocab_id, language)
    if cache_key not in cache:
        cache[cache_key] = AnnifVocabulary(
            vocab_id, self._datadir, language)
    return cache[cache_key]


def initialize_projects(app):
projects_config_path = app.config['PROJECTS_CONFIG_PATH']
Expand Down
13 changes: 0 additions & 13 deletions annif/vocab.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,15 @@
"""Vocabulary management functionality for Annif"""

import os.path
import re
import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException
from annif.util import parse_args

logger = annif.logger


def get_vocab(vocab_spec, datadir, default_language):
    """Create an AnnifVocabulary from a vocabulary specification.

    The specification must start with a vocabulary ID made of word
    characters, optionally followed by arguments in parentheses. The first
    positional argument, if any, selects the language; otherwise
    default_language is used. A new AnnifVocabulary is constructed on
    every call.

    Raises ValueError if vocab_spec does not begin with a vocabulary ID.
    """
    spec_match = re.match(r'(\w+)(\((.*)\))?', vocab_spec)
    if spec_match is None:
        raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")

    # group 1 is the vocabulary ID, group 3 the text inside the parentheses
    vocab_id, _, arg_string = spec_match.groups()
    posargs, _kwargs = parse_args(arg_string)
    if posargs:
        language = posargs[0]
    else:
        language = default_language

    return AnnifVocabulary(vocab_id, datadir, language)


class AnnifVocabulary(DatadirMixin):
"""Class representing a subject vocabulary which can be used by multiple
Annif projects."""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def load_dummy_vocab(tmpdir):
return vocab


def test_get_vocab_invalid():
def test_get_vocab_invalid(registry):
with pytest.raises(ValueError) as excinfo:
annif.vocab.get_vocab('', None, None)
registry.get_vocab('', None)
assert 'Invalid vocabulary specification' in str(excinfo.value)


Expand Down