From f5f32dda465e46437eda8ff817ad98e899aa6f78 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Tue, 23 May 2023 10:36:39 +0300
Subject: [PATCH 01/28] Run "monkeytype apply --pep_563" on all modules

---
 annif/__init__.py | 13 +++-
 annif/analyzer/__init__.py | 15 ++++-
 annif/analyzer/analyzer.py | 10 +--
 annif/analyzer/simple.py | 7 +-
 annif/analyzer/simplemma.py | 7 +-
 annif/analyzer/snowball.py | 6 +-
 annif/analyzer/spacy.py | 7 +-
 annif/analyzer/voikko.py | 8 ++-
 annif/backend/__init__.py | 43 ++++++++----
 annif/backend/backend.py | 82 ++++++++++++++++++-----
 annif/backend/dummy.py | 19 ++++--
 annif/backend/ensemble.py | 54 ++++++++++-----
 annif/backend/fasttext.py | 47 +++++++++----
 annif/backend/http.py | 18 +++--
 annif/backend/mllm.py | 47 +++++++++----
 annif/backend/nn_ensemble.py | 63 ++++++++++++-----
 annif/backend/omikuji.py | 28 ++++++--
 annif/backend/pav.py | 36 ++++++++--
 annif/backend/stwfsa.py | 23 +++++--
 annif/backend/svc.py | 33 ++++++---
 annif/backend/tfidf.py | 34 +++++++---
 annif/backend/yake.py | 54 +++++++++------
 annif/cli_util.py | 65 ++++++++++++----
 annif/config.py | 31 +++++----
 annif/corpus/combine.py | 12 +++-
 annif/corpus/skos.py | 39 ++++++++---
 annif/corpus/subject.py | 82 +++++++++++++----------
 annif/corpus/types.py | 6 +-
 annif/datadir.py | 6 +-
 annif/eval.py | 106 +++++++++++++++++++++++++----
 annif/exception.py | 15 ++++-
 annif/lexical/mllm.py | 98 +++++++++++++++++++++------
 annif/lexical/tokenset.py | 28 +++++---
 annif/lexical/util.py | 21 +++++-
 annif/openapi/validation.py | 16 ++++-
 annif/project.py | 115 +++++++++++++++++++++++++-------
 annif/registry.py | 38 +++++++----
 annif/rest.py | 103 ++++++++++++++++++++++----
 annif/suggestion.py | 55 ++++++++++-----
 annif/transform/__init__.py | 23 ++++++-
 annif/transform/inputlimiter.py | 16 ++++-
 annif/transform/langfilter.py | 16 ++++-
 annif/transform/transform.py | 63 +++++++++++++++--
 annif/vocab.py | 32 ++++++---
 44 files changed, 1254 insertions(+), 386 deletions(-)

diff --git a/annif/__init__.py b/annif/__init__.py
index f4a5831f5..221835da5 100644
--- a/annif/__init__.py
+++ b/annif/__init__.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 
 import logging
 import os
 import os.path
@@ -8,10 +10,15 @@
 logger = logging.getLogger("annif")
 logger.setLevel(level=logging.INFO)
 
+from typing import TYPE_CHECKING, Optional
+
 import annif.backend  # noqa
+if TYPE_CHECKING:
+    from flask.app import Flask
+
 
 
-def create_flask_app(config_name=None):
+def create_flask_app(config_name: None = None) -> flask.app.Flask:
     """Create a Flask app to be used by the CLI."""
     from flask import Flask
 
@@ -23,7 +30,7 @@ def create_flask_app(config_name=None):
     return app
 
 
-def create_app(config_name=None):
+def create_app(config_name: Optional[str] = None) -> flask.app.Flask:
     """Create a Connexion app to be used for the API."""
     # 'cxapp' here is the Connexion application that has a normal Flask app
     # as a property (cxapp.app)
@@ -60,7 +67,7 @@ def create_app(config_name=None):
     return cxapp.app
 
 
-def _get_config_name(config_name):
+def _get_config_name(config_name: Optional[str]) -> str:
     if config_name is None:
         config_name = os.environ.get("ANNIF_CONFIG")
     if config_name is None:
diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
index eacf3d001..0e8d7c189 100644
--- a/annif/analyzer/__init__.py
+++ b/annif/analyzer/__init__.py
@@ -1,12 +1,21 @@
 """Collection of language-specific analyzers and analyzer registry for Annif"""
+from 
__future__ import annotations import re +from typing import TYPE_CHECKING, Union import annif from annif.util import parse_args from . import simple, simplemma, snowball +if TYPE_CHECKING: + from annif.analyzer.simple import SimpleAnalyzer + from annif.analyzer.simplemma import SimplemmaAnalyzer + from annif.analyzer.snowball import SnowballAnalyzer + from annif.analyzer.spacy import SpacyAnalyzer + from annif.analyzer.voikko import VoikkoAnalyzer + _analyzers = {} @@ -14,7 +23,11 @@ def register_analyzer(analyzer): _analyzers[analyzer.name] = analyzer -def get_analyzer(analyzerspec): +def get_analyzer( + analyzerspec: str, +) -> Union[ + SimplemmaAnalyzer, SimpleAnalyzer, SnowballAnalyzer, SpacyAnalyzer, VoikkoAnalyzer +]: match = re.match(r"(\w+)(\((.*)\))?", analyzerspec) if match is None: raise ValueError("Invalid analyzer specification {}".format(analyzerspec)) diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 37457069d..5eb45853d 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -1,8 +1,10 @@ """Common functionality for analyzers.""" +from __future__ import annotations import abc import functools import unicodedata +from typing import TYPE_CHECKING, Any, List, Union _KEY_TOKEN_MIN_LENGTH = "token_min_length" @@ -15,18 +17,18 @@ class Analyzer(metaclass=abc.ABCMeta): name = None token_min_length = 3 # default value, can be overridden in instances - def __init__(self, **kwargs): + def __init__(self, **kwargs) -> None: if _KEY_TOKEN_MIN_LENGTH in kwargs: self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH]) - def tokenize_sentences(self, text): + def tokenize_sentences(self, text: str) -> List[Union[Any, str]]: """Tokenize a piece of text (e.g. a document) into sentences.""" import nltk.tokenize return nltk.tokenize.sent_tokenize(text) @functools.lru_cache(maxsize=50000) - def is_valid_token(self, word): + def is_valid_token(self, word: str) -> bool: """Return True if the word is an acceptable token.""" if len(word) < self.token_min_length: return False @@ -36,7 +38,7 @@ def is_valid_token(self, word): return True return False - def tokenize_words(self, text, filter=True): + def tokenize_words(self, text: str, filter: bool = True) -> List[Union[Any, str]]: """Tokenize a piece of text (e.g. a sentence) into words. If filter=True (default), only return valid tokens (e.g. not punctuation, numbers or very short words)""" diff --git a/annif/analyzer/simple.py b/annif/analyzer/simple.py index 46a8f92f3..15e386d0a 100644 --- a/annif/analyzer/simple.py +++ b/annif/analyzer/simple.py @@ -1,4 +1,7 @@ """Simple analyzer for Annif. Only folds words to lower case.""" +from __future__ import annotations + +from typing import TYPE_CHECKING from . 
import analyzer @@ -6,9 +9,9 @@ class SimpleAnalyzer(analyzer.Analyzer): name = "simple" - def __init__(self, param, **kwargs): + def __init__(self, param: None, **kwargs) -> None: self.param = param super().__init__(**kwargs) - def _normalize_word(self, word): + def _normalize_word(self, word: str) -> str: return word.lower() diff --git a/annif/analyzer/simplemma.py b/annif/analyzer/simplemma.py index 02976982b..9ee0fcbb8 100644 --- a/annif/analyzer/simplemma.py +++ b/annif/analyzer/simplemma.py @@ -1,4 +1,7 @@ """Simplemma analyzer for Annif, based on simplemma lemmatizer.""" +from __future__ import annotations + +from typing import TYPE_CHECKING import simplemma @@ -8,9 +11,9 @@ class SimplemmaAnalyzer(analyzer.Analyzer): name = "simplemma" - def __init__(self, param, **kwargs): + def __init__(self, param: str, **kwargs) -> None: self.lang = param super().__init__(**kwargs) - def _normalize_word(self, word): + def _normalize_word(self, word: str) -> str: return simplemma.lemmatize(word, lang=self.lang) diff --git a/annif/analyzer/snowball.py b/annif/analyzer/snowball.py index c13c4e904..7f0b370d8 100644 --- a/annif/analyzer/snowball.py +++ b/annif/analyzer/snowball.py @@ -1,6 +1,8 @@ """Snowball analyzer for Annif, based on nltk Snowball stemmer.""" +from __future__ import annotations import functools +from typing import TYPE_CHECKING from . import analyzer @@ -8,7 +10,7 @@ class SnowballAnalyzer(analyzer.Analyzer): name = "snowball" - def __init__(self, param, **kwargs): + def __init__(self, param: str, **kwargs) -> None: self.param = param import nltk.stem.snowball @@ -16,5 +18,5 @@ def __init__(self, param, **kwargs): super().__init__(**kwargs) @functools.lru_cache(maxsize=500000) - def _normalize_word(self, word): + def _normalize_word(self, word: str) -> str: return self.stemmer.stem(word.lower()) diff --git a/annif/analyzer/spacy.py b/annif/analyzer/spacy.py index 212a3a5f6..d3a4c649e 100644 --- a/annif/analyzer/spacy.py +++ b/annif/analyzer/spacy.py @@ -1,4 +1,7 @@ """spaCy analyzer for Annif which uses spaCy for lemmatization""" +from __future__ import annotations + +from typing import TYPE_CHECKING, List import annif.util from annif.exception import OperationFailedException @@ -11,7 +14,7 @@ class SpacyAnalyzer(analyzer.Analyzer): name = "spacy" - def __init__(self, param, **kwargs): + def __init__(self, param: str, **kwargs) -> None: import spacy self.param = param @@ -28,7 +31,7 @@ def __init__(self, param, **kwargs): self.lowercase = False super().__init__(**kwargs) - def tokenize_words(self, text, filter=True): + def tokenize_words(self, text: str, filter: bool = True) -> List[str]: lemmas = [ lemma for lemma in (token.lemma_ for token in self.nlp(text.strip())) diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py index d111da25e..24db55918 100644 --- a/annif/analyzer/voikko.py +++ b/annif/analyzer/voikko.py @@ -1,6 +1,8 @@ """Voikko analyzer for Annif, based on libvoikko library.""" +from __future__ import annotations import functools +from typing import TYPE_CHECKING, Dict, Optional import voikko.libvoikko @@ -10,12 +12,12 @@ class VoikkoAnalyzer(analyzer.Analyzer): name = "voikko" - def __init__(self, param, **kwargs): + def __init__(self, param: str, **kwargs) -> None: self.param = param self.voikko = None super().__init__(**kwargs) - def __getstate__(self): + def __getstate__(self) -> Dict[str, Optional[str]]: """Return the state of the object for pickling purposes. 
The Voikko instance is set to None because as a ctypes object it cannot be pickled.""" @@ -23,7 +25,7 @@ def __getstate__(self): return {"param": self.param, "voikko": None} @functools.lru_cache(maxsize=500000) - def _normalize_word(self, word): + def _normalize_word(self, word: str) -> str: if self.voikko is None: self.voikko = voikko.libvoikko.Voikko(self.param) result = self.voikko.analyze(word) diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index 80ede0720..08957bf02 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -1,20 +1,37 @@ """Registry of backend types for Annif""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Type + +if TYPE_CHECKING: + from annif.backend.dummy import DummyBackend + from annif.backend.ensemble import EnsembleBackend + from annif.backend.fasttext import FastTextBackend + from annif.backend.http import HTTPBackend + from annif.backend.mllm import MLLMBackend + from annif.backend.nn_ensemble import NNEnsembleBackend + from annif.backend.omikuji import OmikujiBackend + from annif.backend.pav import PAVBackend + from annif.backend.stwfsa import StwfsaBackend + from annif.backend.svc import SVCBackend + from annif.backend.tfidf import TFIDFBackend + from annif.backend.yake import YakeBackend # define functions for lazily importing each backend (alphabetical order) -def _dummy(): +def _dummy() -> Type[DummyBackend]: from . import dummy return dummy.DummyBackend -def _ensemble(): +def _ensemble() -> Type[EnsembleBackend]: from . import ensemble return ensemble.EnsembleBackend -def _fasttext(): +def _fasttext() -> Type[FastTextBackend]: try: from . import fasttext @@ -23,19 +40,19 @@ def _fasttext(): raise ValueError("fastText not available, cannot use fasttext backend") -def _http(): +def _http() -> Type[HTTPBackend]: from . import http return http.HTTPBackend -def _mllm(): +def _mllm() -> Type[MLLMBackend]: from . import mllm return mllm.MLLMBackend -def _nn_ensemble(): +def _nn_ensemble() -> Type[NNEnsembleBackend]: try: from . import nn_ensemble @@ -46,7 +63,7 @@ def _nn_ensemble(): ) -def _omikuji(): +def _omikuji() -> Type[OmikujiBackend]: try: from . import omikuji @@ -55,13 +72,13 @@ def _omikuji(): raise ValueError("Omikuji not available, cannot use omikuji backend") -def _pav(): +def _pav() -> Type[PAVBackend]: from . import pav return pav.PAVBackend -def _stwfsa(): +def _stwfsa() -> Type[StwfsaBackend]: try: from . import stwfsa @@ -70,19 +87,19 @@ def _stwfsa(): raise ValueError("STWFSA not available, cannot use stwfsa backend") -def _svc(): +def _svc() -> Type[SVCBackend]: from . import svc return svc.SVCBackend -def _tfidf(): +def _tfidf() -> Type[TFIDFBackend]: from . import tfidf return tfidf.TFIDFBackend -def _yake(): +def _yake() -> Type[YakeBackend]: try: from . 
import yake @@ -108,7 +125,7 @@ def _yake(): } -def get_backend(backend_id): +def get_backend(backend_id: str) -> Any: if backend_id in _backend_fns: return _backend_fns[backend_id]() else: diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 754d66111..8fd40257d 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -1,13 +1,27 @@ """Common functionality for backends.""" +from __future__ import annotations import abc import os.path from datetime import datetime, timezone from glob import glob +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from annif import logger from annif.suggestion import SuggestionBatch +if TYPE_CHECKING: + from unittest.mock import Mock + + from annif.corpus.document import ( + DocumentDirectory, + DocumentFile, + DocumentList, + LimitingDocumentCorpus, + TransformingDocumentCorpus, + ) + from annif.project import AnnifProject + class AnnifBackend(metaclass=abc.ABCMeta): """Base class for Annif backends that perform analysis. The @@ -17,7 +31,9 @@ class AnnifBackend(metaclass=abc.ABCMeta): DEFAULT_PARAMETERS = {"limit": 100} - def __init__(self, backend_id, config_params, project): + def __init__( + self, backend_id: str, config_params: Any, project: Union[Mock, AnnifProject] + ) -> None: """Initialize backend with specific parameters. The parameters are a dict. Keys and values depend on the specific backend type.""" @@ -26,22 +42,22 @@ def __init__(self, backend_id, config_params, project): self.project = project self.datadir = project.datadir - def default_params(self): + def default_params(self) -> Dict[str, Union[str, bool, int]]: return self.DEFAULT_PARAMETERS @property - def params(self): + def params(self) -> Dict[str, Any]: params = {} params.update(self.default_params()) params.update(self.config_params) return params @property - def is_trained(self): + def is_trained(self) -> bool: return bool(glob(os.path.join(self.datadir, "*"))) @property - def modification_time(self): + def modification_time(self) -> Optional[datetime.datetime]: mtimes = [ datetime.utcfromtimestamp(os.path.getmtime(p)) for p in glob(os.path.join(self.datadir, "*")) @@ -51,23 +67,44 @@ def modification_time(self): return None return most_recent.replace(tzinfo=timezone.utc) - def _get_backend_params(self, params): + def _get_backend_params( + self, + params: Optional[ + Union[Dict[str, str], Dict[str, int], Dict[str, Union[float, int]]] + ], + ) -> Dict[str, Any]: backend_params = dict(self.params) if params is not None: backend_params.update(params) return backend_params - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: TransformingDocumentCorpus, + params: Dict[str, Union[int, str]], + jobs: int = 0, + ) -> None: """This method can be overridden by backends. It implements the train functionality, with pre-processed parameters.""" pass # default is to do nothing, subclasses may override - def train(self, corpus, params=None, jobs=0): + def train( + self, + corpus: Union[ + str, + TransformingDocumentCorpus, + DocumentList, + DocumentFile, + DocumentDirectory, + ], + params: Optional[Union[Dict[str, Union[float, int]], Dict[str, int]]] = None, + jobs: int = 0, + ) -> None: """Train the model on the given document or subject corpus.""" beparams = self._get_backend_params(params) return self._train(corpus, params=beparams, jobs=jobs) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: """This method can be overridden by backends. 
It should cause the backend to pre-load all data it needs during operation. If parallel is True, the backend should expect to be used for @@ -80,7 +117,9 @@ def _suggest(self, text, params): document, with pre-processed parameters.""" pass # pragma: no cover - def _suggest_batch(self, texts, params): + def _suggest_batch( + self, texts: Union[str, List[str]], params: Dict[str, Any] + ) -> SuggestionBatch: """This method can be implemented by backends to use batching of documents in their operations. This default implementation uses the regular suggest functionality.""" @@ -90,22 +129,26 @@ def _suggest_batch(self, texts, params): limit=int(params.get("limit")), ) - def suggest(self, texts, params=None): + def suggest( + self, + texts: Union[str, List[str]], + params: Optional[Union[Dict[str, str], Dict[str, int]]] = None, + ) -> SuggestionBatch: """Suggest subjects for the input documents and return a list of subject sets represented as a list of SubjectSuggestion objects.""" beparams = self._get_backend_params(params) self.initialize() return self._suggest_batch(texts, params=beparams) - def debug(self, message): + def debug(self, message: str) -> None: """Log a debug message from this backend""" logger.debug("Backend {}: {}".format(self.backend_id, message)) - def info(self, message): + def info(self, message: str) -> None: """Log an info message from this backend""" logger.info("Backend {}: {}".format(self.backend_id, message)) - def warning(self, message): + def warning(self, message: str) -> None: """Log a warning message from this backend""" logger.warning("Backend {}: {}".format(self.backend_id, message)) @@ -119,7 +162,16 @@ def _learn(self, corpus, params): functionality, with pre-processed parameters.""" pass # pragma: no cover - def learn(self, corpus, params=None): + def learn( + self, + corpus: Union[ + DocumentDirectory, + TransformingDocumentCorpus, + LimitingDocumentCorpus, + DocumentFile, + ], + params: Optional[Dict[str, int]] = None, + ) -> None: """Further train the model on the given document or subject corpus.""" beparams = self._get_backend_params(params) return self._learn(corpus, params=beparams) diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index 9d60b0798..4d8bc9f6e 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -1,10 +1,15 @@ """Dummy backend for testing basic interaction of projects and backends""" +from __future__ import annotations +from typing import TYPE_CHECKING, Any, Dict, List, Union from annif.suggestion import SubjectSuggestion from . 
import backend +if TYPE_CHECKING: + from annif.corpus.document import DocumentDirectory, TransformingDocumentCorpus + class DummyBackend(backend.AnnifLearningBackend): name = "dummy" @@ -13,13 +18,15 @@ class DummyBackend(backend.AnnifLearningBackend): is_trained = True modification_time = None - def default_params(self): + def default_params(self) -> Dict[str, int]: return backend.AnnifBackend.DEFAULT_PARAMETERS - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: self.initialized = True - def _suggest(self, text, params): + def _suggest( + self, text: str, params: Dict[str, Union[int, str]] + ) -> List[Union[SubjectSuggestion, Any]]: score = float(params.get("score", 1.0)) # Ensure tests fail if "text" with wrong type ends up here @@ -37,7 +44,11 @@ def _suggest(self, text, params): return [SubjectSuggestion(subject_id=subject_id, score=score)] - def _learn(self, corpus, params): + def _learn( + self, + corpus: Union[TransformingDocumentCorpus, DocumentDirectory], + params: Dict[str, Union[int, str]], + ) -> None: # in this dummy backend we "learn" by picking up the subject ID # of the first subject of the first document in the learning set # and using that in subsequent analysis results diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index 918a41444..b7c049421 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -1,5 +1,7 @@ """Ensemble backend that combines results from multiple projects""" +from __future__ import annotations +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import annif.eval import annif.parallel @@ -9,11 +11,18 @@ from . import backend, hyperopt +if TYPE_CHECKING: + from optuna.study.study import Study + from optuna.trial._trial import Trial + + from annif.backend.hyperopt import HPRecommendation + from annif.corpus.document import DocumentDirectory, DocumentFile + class BaseEnsembleBackend(backend.AnnifBackend): """Base class for ensemble backends""" - def _get_sources_attribute(self, attr): + def _get_sources_attribute(self, attr: str) -> List[Optional[bool]]: params = self._get_backend_params(None) sources = annif.util.parse_sources(params["sources"]) return [ @@ -21,20 +30,27 @@ def _get_sources_attribute(self, attr): for project_id, _ in sources ] - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: # initialize all the source projects params = self._get_backend_params(None) for project_id, _ in annif.util.parse_sources(params["sources"]): project = self.project.registry.get_project(project_id) project.initialize(parallel) - def _suggest_with_sources(self, texts, sources): + def _suggest_with_sources( + self, texts: List[str], sources: List[Tuple[str, float]] + ) -> Dict[str, SuggestionBatch]: return { project_id: self.project.registry.get_project(project_id).suggest(texts) for project_id, _ in sources } - def _merge_source_batches(self, batch_by_source, sources, params): + def _merge_source_batches( + self, + batch_by_source: Dict[str, SuggestionBatch], + sources: List[Tuple[str, float]], + params: Dict[str, Union[int, str]], + ) -> SuggestionBatch: """Merge the given SuggestionBatches from each source into a single SuggestionBatch. The default implementation computes a weighted average based on the weights given in the sources tuple. 
Intended @@ -46,7 +62,9 @@ def _merge_source_batches(self, batch_by_source, sources, params): limit=int(params["limit"]) ) - def _suggest_batch(self, texts, params): + def _suggest_batch( + self, texts: List[str], params: Dict[str, Union[int, float, str]] + ) -> SuggestionBatch: sources = annif.util.parse_sources(params["sources"]) batch_by_source = self._suggest_with_sources(texts, sources) return self._merge_source_batches(batch_by_source, sources, params) @@ -55,7 +73,9 @@ def _suggest_batch(self, texts, params): class EnsembleOptimizer(hyperopt.HyperparameterOptimizer): """Hyperparameter optimizer for the ensemble backend""" - def __init__(self, backend, corpus, metric): + def __init__( + self, backend: "EnsembleBackend", corpus: DocumentDirectory, metric: str + ) -> None: super().__init__(backend, corpus, metric) self._sources = [ project_id @@ -64,7 +84,7 @@ def __init__(self, backend, corpus, metric): ) ] - def _prepare(self, n_jobs=1): + def _prepare(self, n_jobs: int = 1) -> None: self._gold_batches = [] self._source_batches = [] @@ -89,16 +109,16 @@ def _prepare(self, n_jobs=1): self._source_batches.append(suggestions) self._gold_batches.append(gold_batch) - def _normalize(self, hps): + def _normalize(self, hps: Dict[str, float]) -> Dict[str, float]: total = sum(hps.values()) return {source: hps[source] / total for source in hps} - def _format_cfg_line(self, hps): + def _format_cfg_line(self, hps: Dict[str, float]) -> str: return "sources=" + ",".join( [f"{src}:{weight:.4f}" for src, weight in hps.items()] ) - def _objective(self, trial): + def _objective(self, trial: Trial) -> float: eval_batch = annif.eval.EvaluationBatch(self._backend.project.subjects) proj_weights = { project_id: trial.suggest_uniform(project_id, 0.0, 1.0) @@ -114,7 +134,7 @@ def _objective(self, trial): results = eval_batch.results(metrics=[self._metric]) return results[self._metric] - def _postprocess(self, study): + def _postprocess(self, study: Study) -> HPRecommendation: line = self._format_cfg_line(self._normalize(study.best_params)) return hyperopt.HPRecommendation(lines=[line], score=study.best_value) @@ -125,17 +145,21 @@ class EnsembleBackend(BaseEnsembleBackend, hyperopt.AnnifHyperoptBackend): name = "ensemble" @property - def is_trained(self): + def is_trained(self) -> bool: sources_trained = self._get_sources_attribute("is_trained") return all(sources_trained) @property - def modification_time(self): + def modification_time(self) -> None: mtimes = self._get_sources_attribute("modification_time") return max(filter(None, mtimes), default=None) - def get_hp_optimizer(self, corpus, metric): + def get_hp_optimizer( + self, corpus: DocumentDirectory, metric: str + ) -> EnsembleOptimizer: return EnsembleOptimizer(self, corpus, metric) - def _train(self, corpus, params, jobs=0): + def _train( + self, corpus: DocumentFile, params: Dict[str, Union[int, str]], jobs: int = 0 + ): raise NotSupportedException("Training ensemble backend is not possible.") diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index 7b6e9e842..06a233ff2 100644 --- a/annif/backend/fasttext.py +++ b/annif/backend/fasttext.py @@ -1,7 +1,9 @@ """Annif backend using the fastText classifier""" +from __future__ import annotations import collections import os.path +from typing import TYPE_CHECKING, Dict, List, Tuple, Union import fasttext @@ -11,6 +13,12 @@ from . 
import backend, mixins +if TYPE_CHECKING: + from fasttext.FastText import _FastText + from numpy import ndarray + + from annif.corpus.document import DocumentFile, TransformingDocumentCorpus + class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend): """fastText backend for Annif""" @@ -48,14 +56,14 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend): # defaults for uninitialized instances _model = None - def default_params(self): + def default_params(self) -> Dict[str, Union[int, float, str]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(mixins.ChunkingBackend.DEFAULT_PARAMETERS) params.update(self.DEFAULT_PARAMETERS) return params @staticmethod - def _load_model(path): + def _load_model(path: str) -> _FastText: # monkey patch fasttext.FastText.eprint to avoid spurious warning # see https://github.com/facebookresearch/fastText/issues/1067 orig_eprint = fasttext.FastText.eprint @@ -65,7 +73,7 @@ def _load_model(path): fasttext.FastText.eprint = orig_eprint return model - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: if self._model is None: path = os.path.join(self.datadir, self.MODEL_FILE) self.debug("loading fastText model from {}".format(path)) @@ -79,14 +87,16 @@ def initialize(self, parallel=False): ) @staticmethod - def _id_to_label(subject_id): + def _id_to_label(subject_id: int) -> str: return "__label__{:d}".format(subject_id) - def _label_to_subject_id(self, label): + def _label_to_subject_id(self, label: str) -> int: labelnum = label.replace("__label__", "") return int(labelnum) - def _write_train_file(self, corpus, filename): + def _write_train_file( + self, corpus: Union[TransformingDocumentCorpus, DocumentFile], filename: str + ) -> None: with open(filename, "w", encoding="utf-8") as trainfile: for doc in corpus.documents: text = self._normalize_text(doc.text) @@ -98,17 +108,21 @@ def _write_train_file(self, corpus, filename): else: self.warning(f'no labels for document "{doc.text}"') - def _normalize_text(self, text): + def _normalize_text(self, text: str) -> str: return " ".join(self.project.analyzer.tokenize_words(text)) - def _create_train_file(self, corpus): + def _create_train_file( + self, corpus: Union[TransformingDocumentCorpus, DocumentFile] + ) -> None: self.info("creating fastText training file") annif.util.atomic_save( corpus, self.datadir, self.TRAIN_FILE, method=self._write_train_file ) - def _create_model(self, params, jobs): + def _create_model( + self, params: Dict[str, Union[int, float, str]], jobs: int + ) -> None: self.info("creating fastText model") trainpath = os.path.join(self.datadir, self.TRAIN_FILE) modelpath = os.path.join(self.datadir, self.MODEL_FILE) @@ -123,7 +137,12 @@ def _create_model(self, params, jobs): self._model = fasttext.train_supervised(trainpath, **params) self._model.save_model(modelpath) - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[TransformingDocumentCorpus, DocumentFile, str], + params: Dict[str, Union[int, float, str]], + jobs: int = 0, + ) -> None: if corpus != "cached": if corpus.is_empty(): raise NotSupportedException( @@ -134,7 +153,9 @@ def _train(self, corpus, params, jobs=0): self.info("Reusing cached training data from previous run.") self._create_model(params, jobs) - def _predict_chunks(self, chunktexts, limit): + def _predict_chunks( + self, chunktexts: List[str], limit: int + ) -> Tuple[List[List[str]], List[ndarray]]: return self._model.predict( list( filter( @@ -144,7 +165,9 @@ 
def _predict_chunks(self, chunktexts, limit): limit, ) - def _suggest_chunks(self, chunktexts, params): + def _suggest_chunks( + self, chunktexts: List[str], params: Dict[str, Union[int, float, str]] + ) -> List[SubjectSuggestion]: limit = int(params["limit"]) chunklabels, chunkscores = self._predict_chunks(chunktexts, limit) label_scores = collections.defaultdict(float) diff --git a/annif/backend/http.py b/annif/backend/http.py index a76dbbb6a..8f26abe1b 100644 --- a/annif/backend/http.py +++ b/annif/backend/http.py @@ -1,8 +1,9 @@ """HTTP/REST client backend that makes calls to a web service and returns the results""" - +from __future__ import annotations import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import dateutil.parser import requests @@ -13,13 +14,16 @@ from . import backend +if TYPE_CHECKING: + from datetime import datetime + class HTTPBackend(backend.AnnifBackend): name = "http" _headers = None @property - def headers(self): + def headers(self) -> Dict[str, str]: if self._headers is None: version = importlib.metadata.version("annif") self._headers = { @@ -28,17 +32,17 @@ def headers(self): return self._headers @property - def is_trained(self): + def is_trained(self) -> bool: return self._get_project_info("is_trained") @property - def modification_time(self): + def modification_time(self) -> Optional[datetime]: mtime = self._get_project_info("modification_time") if mtime is None: return None return dateutil.parser.parse(mtime) - def _get_project_info(self, key): + def _get_project_info(self, key: str) -> Optional[Union[bool, str]]: params = self._get_backend_params(None) try: req = requests.get( @@ -59,7 +63,9 @@ def _get_project_info(self, key): else: return None - def _suggest(self, text, params): + def _suggest( + self, text: str, params: Dict[str, Union[int, str]] + ) -> List[Union[Any, SubjectSuggestion]]: data = {"text": text} if "project" in params: data["project"] = params["project"] diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index 6954dadc3..da6d1799b 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -1,9 +1,10 @@ """Maui-like Lexical Matching backend""" +from __future__ import annotations import os.path +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Tuple, Union import joblib -import numpy as np import annif.eval import annif.util @@ -13,11 +14,20 @@ from . 
import backend, hyperopt +if TYPE_CHECKING: + from numpy import float64, ndarray + from optuna.study.study import Study + from optuna.trial._trial import Trial + + from annif.backend.hyperopt import HPRecommendation + from annif.corpus.document import DocumentDirectory, DocumentFile + from annif.lexical.mllm import Candidate + class MLLMOptimizer(hyperopt.HyperparameterOptimizer): """Hyperparameter optimizer for the MLLM backend""" - def _prepare(self, n_jobs=1): + def _prepare(self, n_jobs: int = 1) -> None: self._backend.initialize() self._train_x, self._train_y = self._backend._load_train_data() self._candidates = [] @@ -29,7 +39,7 @@ def _prepare(self, n_jobs=1): self._candidates.append(candidates) self._gold_subjects.append(doc.subject_set) - def _objective(self, trial): + def _objective(self, trial: Trial) -> float: params = { "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 30), "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 100, 2000), @@ -52,7 +62,7 @@ def _objective(self, trial): results = batch.results(metrics=[self._metric]) return results[self._metric] - def _postprocess(self, study): + def _postprocess(self, study: Study) -> HPRecommendation: bp = study.best_params lines = [ f"min_samples_leaf={bp['min_samples_leaf']}", @@ -80,15 +90,15 @@ class MLLMBackend(hyperopt.AnnifHyperoptBackend): "use_hidden_labels": False, } - def get_hp_optimizer(self, corpus, metric): + def get_hp_optimizer(self, corpus: DocumentDirectory, metric: str) -> MLLMOptimizer: return MLLMOptimizer(self, corpus, metric) - def default_params(self): + def default_params(self) -> Dict[str, Union[int, float, bool]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params - def _load_model(self): + def _load_model(self) -> MLLMModel: path = os.path.join(self.datadir, self.MODEL_FILE) self.debug("loading model from {}".format(path)) if os.path.exists(path): @@ -98,7 +108,7 @@ def _load_model(self): "model {} not found".format(path), backend_id=self.backend_id ) - def _load_train_data(self): + def _load_train_data(self) -> Tuple[ndarray, ndarray]: path = os.path.join(self.datadir, self.TRAIN_FILE) if os.path.exists(path): return joblib.load(path) @@ -107,11 +117,16 @@ def _load_train_data(self): "train data file {} not found".format(path), backend_id=self.backend_id ) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: if self._model is None: self._model = self._load_model() - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[DocumentFile, str, DocumentDirectory], + params: Dict[str, Union[int, float, bool, str]], + jobs: int = 0, + ) -> None: self.info("starting train") if corpus != "cached": if corpus.is_empty(): @@ -137,16 +152,22 @@ def _train(self, corpus, params, jobs=0): self.info("saving model") annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE) - def _generate_candidates(self, text): + def _generate_candidates(self, text: str) -> List[Union[Candidate, Any]]: return self._model.generate_candidates(text, self.project.analyzer) - def _prediction_to_result(self, prediction, params): + def _prediction_to_result( + self, + prediction: List[Union[Tuple[float64, int], Any]], + params: Dict[str, Union[int, float, bool, str]], + ) -> Iterator[Any]: vector = np.zeros(len(self.project.subjects), dtype=np.float32) for score, subject_id in prediction: vector[subject_id] = score return vector_to_suggestions(vector, int(params["limit"])) - def _suggest(self, 
text, params): + def _suggest( + self, text: str, params: Dict[str, Union[int, float, bool, str]] + ) -> Iterator[Any]: candidates = self._generate_candidates(text) prediction = self._model.predict(candidates) return self._prediction_to_result(prediction, params) diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index 2ee5f89c4..1ea8989fb 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -1,14 +1,14 @@ """Neural network based ensemble backend that combines results from multiple projects.""" - +from __future__ import annotations import os.path import shutil from io import BytesIO +from typing import TYPE_CHECKING, Dict, List, Tuple, Union import joblib import lmdb -import numpy as np import tensorflow.keras.backend as K from scipy.sparse import csc_matrix, csr_matrix from tensorflow.keras.layers import Add, Dense, Dropout, Flatten, Input, Layer @@ -23,13 +23,19 @@ from . import backend, ensemble +if TYPE_CHECKING: + from numpy import ndarray + from tensorflow.python.framework.ops import EagerTensor + + from annif.corpus.document import DocumentFile, LimitingDocumentCorpus + -def idx_to_key(idx): +def idx_to_key(idx: int) -> bytes: """convert an integer index to a binary key for use in LMDB""" return b"%08d" % idx -def key_to_idx(key): +def key_to_idx(key: Union[memoryview, bytes]) -> int: """convert a binary LMDB key to an integer index""" return int(key) @@ -47,7 +53,7 @@ def __init__(self, txn, batch_size): self._counter = 0 self._batch_size = batch_size - def add_sample(self, inputs, targets): + def add_sample(self, inputs: ndarray, targets: ndarray) -> None: # use zero-padded 8-digit key key = idx_to_key(self._counter) self._counter += 1 @@ -58,7 +64,7 @@ def add_sample(self, inputs, targets): buf.seek(0) self._txn.put(key, buf.read()) - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> Tuple[ndarray, ndarray]: """get a particular batch of samples""" cursor = self._txn.cursor() first_key = idx * self._batch_size @@ -73,7 +79,7 @@ def __getitem__(self, idx): target_arrays.append(target_csr.toarray().flatten()) return np.array(input_arrays), np.array(target_arrays) - def __len__(self): + def __len__(self) -> int: """return the number of available batches""" return int(np.ceil(self._counter / self._batch_size)) @@ -81,7 +87,7 @@ def __len__(self): class MeanLayer(Layer): """Custom Keras layer that calculates mean values along the 2nd axis.""" - def call(self, inputs): + def call(self, inputs: EagerTensor) -> EagerTensor: return K.mean(inputs, axis=2) @@ -106,12 +112,12 @@ class NNEnsembleBackend(backend.AnnifLearningBackend, ensemble.BaseEnsembleBacke # defaults for uninitialized instances _model = None - def default_params(self): + def default_params(self) -> Dict[str, Union[int, float, str]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: super().initialize(parallel) if self._model is not None: return # already initialized @@ -130,7 +136,12 @@ def initialize(self, parallel=False): model_filename, custom_objects={"MeanLayer": MeanLayer} ) - def _merge_source_batches(self, batch_by_source, sources, params): + def _merge_source_batches( + self, + batch_by_source: Dict[str, SuggestionBatch], + sources: List[Tuple[str, float]], + params: Dict[str, Union[int, float, str]], + ) -> SuggestionBatch: src_weight = dict(sources) score_vectors = np.array( [ @@ -153,7 +164,7 @@ 
def _merge_source_batches(self, batch_by_source, sources, params): self.project.subjects, ) - def _create_model(self, sources): + def _create_model(self, sources: List[Union[Tuple[str, float], str]]) -> None: self.info("creating NN ensemble model") inputs = Input(shape=(len(self.project.subjects), len(sources))) @@ -185,7 +196,12 @@ def _create_model(self, sources): self._model.summary(print_fn=summary.append) self.debug("Created model: \n" + "\n".join(summary)) - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[DocumentFile, str], + params: Dict[str, Union[int, float, str]], + jobs: int = 0, + ) -> None: sources = annif.util.parse_sources(self.params["sources"]) self._create_model(sources) self._fit_model( @@ -195,7 +211,12 @@ def _train(self, corpus, params, jobs=0): n_jobs=jobs, ) - def _corpus_to_vectors(self, corpus, seq, n_jobs): + def _corpus_to_vectors( + self, + corpus: Union[LimitingDocumentCorpus, DocumentFile], + seq: LMDBSequence, + n_jobs: int, + ) -> None: # pass corpus through all source projects sources = dict(annif.util.parse_sources(self.params["sources"])) @@ -236,7 +257,13 @@ def _open_lmdb(self, cached, lmdb_map_size): shutil.rmtree(lmdb_path) return lmdb.open(lmdb_path, map_size=lmdb_map_size, writemap=True) - def _fit_model(self, corpus, epochs, lmdb_map_size, n_jobs=1): + def _fit_model( + self, + corpus: Union[LimitingDocumentCorpus, DocumentFile, str], + epochs: int, + lmdb_map_size: int, + n_jobs: int = 1, + ) -> None: env = self._open_lmdb(corpus == "cached", lmdb_map_size) if corpus != "cached": if corpus.is_empty(): @@ -256,7 +283,11 @@ def _fit_model(self, corpus, epochs, lmdb_map_size, n_jobs=1): annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE) - def _learn(self, corpus, params): + def _learn( + self, + corpus: Union[LimitingDocumentCorpus, DocumentFile], + params: Dict[str, Union[int, float, str]], + ) -> None: self.initialize() self._fit_model( corpus, int(params["learn-epochs"]), int(params["lmdb_map_size"]) diff --git a/annif/backend/omikuji.py b/annif/backend/omikuji.py index 99218b951..1c2c51645 100644 --- a/annif/backend/omikuji.py +++ b/annif/backend/omikuji.py @@ -1,7 +1,9 @@ """Annif backend using the Omikuji classifier""" +from __future__ import annotations import os.path import shutil +from typing import TYPE_CHECKING, Dict, List, Union import omikuji @@ -15,6 +17,11 @@ from . 
import backend, mixins +if TYPE_CHECKING: + from scipy.sparse._csr import csr_matrix + + from annif.corpus.document import DocumentFile + class OmikujiBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): """Omikuji based backend for Annif""" @@ -36,12 +43,12 @@ class OmikujiBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): "collapse_every_n_layers": 0, } - def default_params(self): + def default_params(self) -> Dict[str, Union[int, bool]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params - def _initialize_model(self): + def _initialize_model(self) -> None: if self._model is None: path = os.path.join(self.datadir, self.MODEL_FILE) self.debug("loading model from {}".format(path)) @@ -58,11 +65,11 @@ def _initialize_model(self): "model {} not found".format(path), backend_id=self.backend_id ) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: self.initialize_vectorizer() self._initialize_model() - def _create_train_file(self, veccorpus, corpus): + def _create_train_file(self, veccorpus: csr_matrix, corpus: DocumentFile) -> None: self.info("creating train file") path = os.path.join(self.datadir, self.TRAIN_FILE) with open(path, "w", encoding="utf-8") as trainfile: @@ -89,7 +96,7 @@ def _create_train_file(self, veccorpus, corpus): trainfile.seek(0) print("{:08d}".format(n_samples), end="", file=trainfile) - def _create_model(self, params, jobs): + def _create_model(self, params: Dict[str, Union[int, bool]], jobs: int) -> None: train_path = os.path.join(self.datadir, self.TRAIN_FILE) model_path = os.path.join(self.datadir, self.MODEL_FILE) hyper_param = omikuji.Model.default_hyper_param() @@ -104,7 +111,12 @@ def _create_model(self, params, jobs): shutil.rmtree(model_path) self._model.save(os.path.join(self.datadir, self.MODEL_FILE)) - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[DocumentFile, str], + params: Dict[str, Union[int, bool]], + jobs: int = 0, + ) -> None: if corpus != "cached": if corpus.is_empty(): raise NotSupportedException( @@ -122,7 +134,9 @@ def _train(self, corpus, params, jobs=0): self.info("Reusing cached training data from previous run.") self._create_model(params, jobs) - def _suggest_batch(self, texts, params): + def _suggest_batch( + self, texts: List[str], params: Dict[str, Union[int, bool]] + ) -> SuggestionBatch: vector = self.vectorizer.transform(texts) limit = int(params["limit"]) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index 5125cb8cd..125be6aed 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -2,8 +2,10 @@ learns which concept suggestions from each backend are trustworthy using the PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by individual backends into probabilities.""" +from __future__ import annotations import os.path +from typing import TYPE_CHECKING, Dict, List, Tuple, Union import joblib import numpy as np @@ -17,6 +19,12 @@ from . 
import backend, ensemble +if TYPE_CHECKING: + from scipy.sparse._csc import csc_matrix + + from annif.corpus.document import DocumentFile + from annif.project import AnnifProject + class PAVBackend(ensemble.BaseEnsembleBackend): """PAV ensemble backend that combines results from multiple projects""" @@ -30,12 +38,12 @@ class PAVBackend(ensemble.BaseEnsembleBackend): DEFAULT_PARAMETERS = {"min-docs": 10} - def default_params(self): + def default_params(self) -> Dict[str, int]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: super().initialize(parallel) if self._models is not None: return # already initialized @@ -53,11 +61,16 @@ def initialize(self, parallel=False): backend_id=self.backend_id, ) - def _get_model(self, source_project_id): + def _get_model(self, source_project_id: str) -> Dict[int, IsotonicRegression]: self.initialize() return self._models[source_project_id] - def _merge_source_batches(self, batch_by_source, sources, params): + def _merge_source_batches( + self, + batch_by_source: Dict[str, SuggestionBatch], + sources: List[Tuple[str, float]], + params: Dict[str, Union[int, str]], + ) -> SuggestionBatch: reg_batch_by_source = {} for project_id, batch in batch_by_source.items(): reg_models = self._get_model(project_id) @@ -82,7 +95,9 @@ def _merge_source_batches(self, batch_by_source, sources, params): return super()._merge_source_batches(reg_batch_by_source, sources, params) @staticmethod - def _suggest_train_corpus(source_project, corpus): + def _suggest_train_corpus( + source_project: AnnifProject, corpus: DocumentFile + ) -> Tuple[scipy.sparse._csc.csc_matrix, scipy.sparse._csc.csc_matrix]: # lists for constructing score matrix data, row, col = [], [], [] # lists for constructing true label matrix @@ -114,7 +129,9 @@ def _suggest_train_corpus(source_project, corpus): ) return csc_matrix(scores), csc_matrix(true) - def _create_pav_model(self, source_project_id, min_docs, corpus): + def _create_pav_model( + self, source_project_id: str, min_docs: int, corpus: DocumentFile + ) -> None: self.info( "creating PAV model for source {}, min_docs={}".format( source_project_id, min_docs @@ -138,7 +155,12 @@ def _create_pav_model(self, source_project_id, min_docs, corpus): pav_regressions, self.datadir, model_filename, method=joblib.dump ) - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[str, DocumentFile], + params: Dict[str, Union[int, str]], + jobs: int = 0, + ) -> None: if corpus == "cached": raise NotSupportedException( "Training pav project from cached data not supported." diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index d8217ee03..688a03112 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import os +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from stwfsapy.predictor import StwfsapyPredictor @@ -8,6 +11,9 @@ from . 
import backend +if TYPE_CHECKING: + from annif.corpus.document import DocumentFile, DocumentList + _KEY_CONCEPT_TYPE_URI = "concept_type_uri" _KEY_SUBTHESAURUS_TYPE_URI = "sub_thesaurus_type_uri" _KEY_THESAURUS_RELATION_TYPE_URI = "thesaurus_relation_type_uri" @@ -59,7 +65,7 @@ class StwfsaBackend(backend.AnnifBackend): _model = None - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: if self._model is None: path = os.path.join(self.datadir, self.MODEL_FILE) self.debug(f"Loading STWFSA model from {path}.") @@ -71,7 +77,9 @@ def initialize(self, parallel=False): f"Model not found at {path}", backend_id=self.backend_id ) - def _load_data(self, corpus): + def _load_data( + self, corpus: Union[DocumentList, DocumentFile, str] + ) -> Tuple[List[str], List[List[Union[str, Any]]]]: if corpus == "cached": raise NotSupportedException( "Training stwfsa project from cached data not supported." @@ -93,7 +101,12 @@ def _load_data(self, corpus): ) return X, y - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[DocumentList, DocumentFile, str], + params: Dict[str, Union[str, bool, int]], + jobs: int = 0, + ) -> None: X, y = self._load_data(corpus) new_params = { key: self.STWFSA_PARAMETERS[key](val) @@ -114,7 +127,9 @@ def _train(self, corpus, params, jobs=0): lambda model, store_path: model.store(store_path), ) - def _suggest(self, text, params): + def _suggest( + self, text: str, params: Dict[str, Union[str, bool, int]] + ) -> List[Union[SubjectSuggestion, Any]]: self.debug(f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') result = self._model.suggest_proba([text])[0] suggestions = [] diff --git a/annif/backend/svc.py b/annif/backend/svc.py index ad8939f5f..fbeab02e7 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -1,9 +1,10 @@ """Annif backend using a SVM classifier""" +from __future__ import annotations import os.path +from typing import TYPE_CHECKING, Dict, List, Tuple, Union import joblib -import numpy as np import scipy.special from sklearn.svm import LinearSVC @@ -13,6 +14,12 @@ from . 
import backend, mixins +if TYPE_CHECKING: + from numpy import ndarray + from scipy.sparse._csr import csr_matrix + + from annif.corpus.document import DocumentFile + class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): """Support vector classifier backend for Annif""" @@ -26,12 +33,12 @@ class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): DEFAULT_PARAMETERS = {"min_df": 1, "ngram": 1} - def default_params(self): + def default_params(self) -> Dict[str, int]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params - def _initialize_model(self): + def _initialize_model(self) -> None: if self._model is None: path = os.path.join(self.datadir, self.MODEL_FILE) self.debug("loading model from {}".format(path)) @@ -42,11 +49,13 @@ def _initialize_model(self): "model {} not found".format(path), backend_id=self.backend_id ) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: self.initialize_vectorizer() self._initialize_model() - def _corpus_to_texts_and_classes(self, corpus): + def _corpus_to_texts_and_classes( + self, corpus: DocumentFile + ) -> Tuple[List[str], List[int]]: texts = [] classes = [] for doc in corpus.documents: @@ -61,7 +70,7 @@ def _corpus_to_texts_and_classes(self, corpus): classes.append(doc.subject_set[0]) return texts, classes - def _train_classifier(self, veccorpus, classes): + def _train_classifier(self, veccorpus: csr_matrix, classes: List[int]) -> None: self.info("creating classifier") self._model = LinearSVC() self._model.fit(veccorpus, classes) @@ -69,7 +78,9 @@ def _train_classifier(self, veccorpus, classes): self._model, self.datadir, self.MODEL_FILE, method=joblib.dump ) - def _train(self, corpus, params, jobs=0): + def _train( + self, corpus: Union[DocumentFile, str], params: Dict[str, int], jobs: int = 0 + ) -> None: if corpus == "cached": raise NotSupportedException( "SVC backend does not support reuse of cached training data." @@ -85,7 +96,9 @@ def _train(self, corpus, params, jobs=0): veccorpus = self.create_vectorizer(texts, vecparams) self._train_classifier(veccorpus, classes) - def _scores_to_suggestions(self, scores, params): + def _scores_to_suggestions( + self, scores: ndarray, params: Dict[str, int] + ) -> List[SubjectSuggestion]: results = [] limit = int(params["limit"]) for class_id in np.argsort(scores)[::-1][:limit]: @@ -96,7 +109,9 @@ def _scores_to_suggestions(self, scores, params): ) return results - def _suggest_batch(self, texts, params): + def _suggest_batch( + self, texts: List[str], params: Dict[str, int] + ) -> SuggestionBatch: vector = self.vectorizer.transform(texts) confidences = self._model.decision_function(vector) # convert to 0..1 score range using logistic function diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index 335fe53d1..76d0b5622 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -1,8 +1,10 @@ """Backend that returns most similar subjects based on similarity in sparse TF-IDF normalized bag-of-words vector space""" +from __future__ import annotations import os.path import tempfile +from typing import TYPE_CHECKING, Any, Dict, Iterator, Union import gensim.similarities from gensim.matutils import Sparse2Corpus @@ -13,19 +15,24 @@ from . 
import backend, mixins +if TYPE_CHECKING: + from scipy.sparse._csr import csr_matrix + + from annif.corpus.document import DocumentFile, TransformingDocumentCorpus + class SubjectBuffer: """A file-backed buffer to store and retrieve subject text.""" BUFFER_SIZE = 100 - def __init__(self, tempdir, subject_id): + def __init__(self, tempdir: str, subject_id: int) -> None: filename = "{:08d}.txt".format(subject_id) self._path = os.path.join(tempdir, filename) self._buffer = [] self._created = False - def flush(self): + def flush(self) -> None: if self._created: mode = "a" else: @@ -38,12 +45,12 @@ def flush(self): self._buffer = [] self._created = True - def write(self, text): + def write(self, text: str) -> None: self._buffer.append(text) if len(self._buffer) >= self.BUFFER_SIZE: self.flush() - def read(self): + def read(self) -> str: if not self._created: # file was never created - we can simply return the buffer content return "\n".join(self._buffer) @@ -62,7 +69,9 @@ class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): INDEX_FILE = "tfidf-index" - def _generate_subjects_from_documents(self, corpus): + def _generate_subjects_from_documents( + self, corpus: Union[TransformingDocumentCorpus, DocumentFile] + ) -> Iterator[str]: with tempfile.TemporaryDirectory() as tempdir: subject_buffer = {} for subject_id in range(len(self.project.subjects)): @@ -76,7 +85,7 @@ def _generate_subjects_from_documents(self, corpus): for sid in range(len(self.project.subjects)): yield subject_buffer[sid].read() - def _initialize_index(self): + def _initialize_index(self) -> None: if self._index is None: path = os.path.join(self.datadir, self.INDEX_FILE) self.debug("loading similarity index from {}".format(path)) @@ -88,11 +97,11 @@ def _initialize_index(self): backend_id=self.backend_id, ) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: self.initialize_vectorizer() self._initialize_index() - def _create_index(self, veccorpus): + def _create_index(self, veccorpus: csr_matrix) -> None: self.info("creating similarity index") gscorpus = Sparse2Corpus(veccorpus, documents_columns=False) self._index = gensim.similarities.SparseMatrixSimilarity( @@ -100,7 +109,12 @@ def _create_index(self, veccorpus): ) annif.util.atomic_save(self._index, self.datadir, self.INDEX_FILE) - def _train(self, corpus, params, jobs=0): + def _train( + self, + corpus: Union[TransformingDocumentCorpus, DocumentFile, str], + params: Dict[str, Union[str, int]], + jobs: int = 0, + ) -> None: if corpus == "cached": raise NotSupportedException( "Training tfidf project from cached data not supported." @@ -112,7 +126,7 @@ def _train(self, corpus, params, jobs=0): veccorpus = self.create_vectorizer(subjects) self._create_index(veccorpus) - def _suggest(self, text, params): + def _suggest(self, text: str, params: Dict[str, int]) -> Iterator[Any]: self.debug( 'Suggesting subjects for text "{}..." (len={})'.format(text[:20], len(text)) ) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index bb684aaf5..80da1da7e 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -1,10 +1,10 @@ """Annif backend using Yake keyword extraction""" -# For license remarks of this backend see README.md: -# https://github.com/NatLibFi/Annif#license. +from __future__ import annotations import os.path import re from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple, Union import joblib import yake @@ -16,6 +16,16 @@ from . 
import backend +# For license remarks of this backend see README.md: +# https://github.com/NatLibFi/Annif#license. + + +if TYPE_CHECKING: + from numpy import float64 + from rdflib.term import URIRef + + from annif.corpus.document import DocumentFile + class YakeBackend(backend.AnnifBackend): """Yake based backend for Annif""" @@ -38,7 +48,7 @@ class YakeBackend(backend.AnnifBackend): "remove_parentheses": False, } - def default_params(self): + def default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -48,7 +58,7 @@ def is_trained(self): return True @property - def label_types(self): + def label_types(self) -> List[URIRef]: if type(self.params["label_types"]) == str: # Label types set by user label_types = [lt.strip() for lt in self.params["label_types"].split(",")] self._validate_label_types(label_types) @@ -56,17 +66,17 @@ def label_types(self): label_types = self.params["label_types"] # The defaults return [getattr(SKOS, lt) for lt in label_types] - def _validate_label_types(self, label_types): + def _validate_label_types(self, label_types: List[str]) -> None: for lt in label_types: if lt not in ("prefLabel", "altLabel", "hiddenLabel"): raise ConfigurationException( f"invalid label type {lt}", backend_id=self.backend_id ) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: self._initialize_index() - def _initialize_index(self): + def _initialize_index(self) -> None: if self._index is None: path = os.path.join(self.datadir, self.INDEX_FILE) if os.path.exists(path): @@ -78,12 +88,12 @@ def _initialize_index(self): self._save_index(path) self.info(f"Created index with {len(self._index)} labels") - def _save_index(self, path): + def _save_index(self, path: str) -> None: annif.util.atomic_save( self._index, self.datadir, self.INDEX_FILE, method=joblib.dump ) - def _create_index(self): + def _create_index(self) -> Dict[str, Set[str]]: index = defaultdict(set) skos_vocab = self.project.vocab.skos for concept in skos_vocab.concepts: @@ -95,21 +105,23 @@ def _create_index(self): index.pop("", None) # Remove possible empty string entry return dict(index) - def _normalize_label(self, label): + def _normalize_label(self, label: str) -> str: label = str(label) if annif.util.boolean(self.params["remove_parentheses"]): label = re.sub(r" \(.*\)", "", label) normalized_label = self._normalize_phrase(label) return self._sort_phrase(normalized_label) - def _normalize_phrase(self, phrase): + def _normalize_phrase(self, phrase: str) -> str: return " ".join(self.project.analyzer.tokenize_words(phrase, filter=False)) - def _sort_phrase(self, phrase): + def _sort_phrase(self, phrase: str) -> str: words = phrase.split() return " ".join(sorted(words)) - def _suggest(self, text, params): + def _suggest( + self, text: str, params: Dict[str, Any] + ) -> List[Union[SubjectSuggestion, Any]]: self.debug(f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') limit = int(params["limit"]) @@ -132,7 +144,9 @@ def _suggest(self, text, params): ] return subject_suggestions - def _keyphrases2suggestions(self, keyphrases): + def _keyphrases2suggestions( + self, keyphrases: List[Union[Any, Tuple[str, float64]]] + ) -> List[Union[Any, Tuple[str, float64]]]: suggestions = [] not_matched = [] for kp, score in keyphrases: @@ -154,16 +168,18 @@ def _keyphrases2suggestions(self, keyphrases): ) return suggestions - def _keyphrase2uris(self, keyphrase): + def _keyphrase2uris(self, keyphrase: str) -> Set[str]: keyphrase = self._normalize_phrase(keyphrase) keyphrase = self._sort_phrase(keyphrase) return self._index.get(keyphrase, []) - def _transform_score(self, score): + def _transform_score(self, score: float64) -> float64: score = max(score, 0) return 1.0 / (score + 1) - def _combine_suggestions(self, suggestions): + def _combine_suggestions( + self, suggestions: List[Union[Any, Tuple[str, float], Tuple[str, float64]]] + ) -> List[Union[Any, Tuple[str, float], Tuple[str, float64]]]: combined_suggestions = {} for uri, score in suggestions: if uri not in combined_suggestions: @@ -173,12 +189,12 @@ def _combine_suggestions(self, suggestions): combined_suggestions[uri] = self._combine_scores(score, old_score) return list(combined_suggestions.items()) - def _combine_scores(self, score1, score2): + def _combine_scores(self, score1: float, score2: float) -> float: # The result is never smaller than the greater input score1 = score1 / 2 + 0.5 score2 = score2 / 2 + 0.5 confl = score1 * score2 / (score1 * score2 + (1 - score1) * (1 - score2)) return (confl - 0.5) * 2 - def _train(self, corpus, params, jobs=0): + def _train(self, corpus: DocumentFile, params: Dict[str, Any], jobs: int = 0): raise NotSupportedException("Training yake backend is not possible.") diff --git a/annif/cli_util.py b/annif/cli_util.py index 72da0d46c..7e75ff04f 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -1,10 +1,11 @@ """Utility functions for Annif CLI commands""" - +from __future__ import annotations import collections import itertools import os import sys +from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Tuple, Union import click import click_log @@ -14,10 +15,30 @@ from annif.exception import ConfigurationException from annif.project import Access +if TYPE_CHECKING: + from datetime import datetime + from io import TextIOWrapper + + from click.core import Argument, Context, Option + + from annif.corpus.combine import CombinedCorpus + from annif.corpus.document import ( + DocumentDirectory, + DocumentFile, + DocumentList, + LimitingDocumentCorpus, + ) + from annif.corpus.subject import SubjectIndex + from annif.project import AnnifProject + from annif.suggestion import SuggestionResult + from annif.vocab import AnnifVocabulary + logger = annif.logger -def _set_project_config_file_path(ctx, param, value): +def _set_project_config_file_path( + ctx: Context, param: Option, value: Optional[str] +) -> None: """Override the default path or the path given in env by CLI option""" with ctx.obj.load_app().app_context(): if value: @@ -66,7 +87,7 @@ def docs_limit_option(f): )(f) -def get_project(project_id): +def get_project(project_id: str) -> AnnifProject: """ Helper function to get a project by ID and bail out if it doesn't exist""" try: @@ -76,7 +97,7 @@ def get_project(project_id): sys.exit(1) -def get_vocab(vocab_id): +def get_vocab(vocab_id: str) -> AnnifVocabulary: """ Helper function to get a vocabulary by ID and bail out if it doesn't 
exist""" @@ -87,7 +108,7 @@ def get_vocab(vocab_id): sys.exit(1) -def make_list_template(*rows): +def make_list_template(*rows) -> str: """Helper function to create a template for a list of entries with fields of variable width. The width of each field is determined by the longest item in the field in the given rows.""" @@ -105,14 +126,19 @@ def make_list_template(*rows): ) -def format_datetime(dt): +def format_datetime(dt: Optional[datetime]) -> str: """Helper function to format a datetime object as a string in the local time.""" if dt is None: return "-" return dt.astimezone().strftime("%Y-%m-%d %H:%M:%S") -def open_documents(paths, subject_index, vocab_lang, docs_limit): +def open_documents( + paths: Union[Tuple[str], Tuple[str, str], Tuple[()]], + subject_index: SubjectIndex, + vocab_lang: str, + docs_limit: Optional[int], +) -> Union[LimitingDocumentCorpus, DocumentDirectory, CombinedCorpus, DocumentFile]: """Helper function to open a document corpus from a list of pathnames, each of which is either a TSV file or a directory of TXT files. For directories with subjects in TSV files, the given vocabulary language @@ -140,7 +166,9 @@ def open_doc_path(path, subject_index): return docs -def open_text_documents(paths, docs_limit): +def open_text_documents( + paths: Union[Tuple[str], Tuple[str, str]], docs_limit: Optional[int] +) -> DocumentList: """ Helper function to read text documents from the given file paths. Returns a DocumentList object with Documents having no subjects. If a path is "-", the @@ -160,7 +188,12 @@ def _docs(paths): return annif.corpus.DocumentList(_docs(paths[:docs_limit])) -def show_hits(hits, project, lang, file=None): +def show_hits( + hits: SuggestionResult, + project: AnnifProject, + lang: str, + file: Optional[TextIOWrapper] = None, +) -> None: """ Print subject suggestions to the console or a file. The suggestions are displayed as a table, with one row per hit. 
Each row contains the URI, label, possible notation, @@ -177,7 +210,9 @@ def show_hits(hits, project, lang, file=None): click.echo(line, file=file) -def parse_backend_params(backend_param, project): +def parse_backend_params( + backend_param: Union[Tuple[str], Tuple[()]], project: AnnifProject +) -> DefaultDict[str, Dict[str, str]]: """Parse a list of backend parameters given with the --backend-param option into a nested dict structure""" backend_params = collections.defaultdict(dict) @@ -189,7 +224,7 @@ def parse_backend_params(backend_param, project): return backend_params -def _validate_backend_params(backend, beparam, project): +def _validate_backend_params(backend: str, beparam: str, project: AnnifProject) -> None: if backend != project.config["backend"]: raise ConfigurationException( 'The backend {} in CLI option "-b {}" not matching the project' @@ -197,13 +232,15 @@ def _validate_backend_params(backend, beparam, project): ) -def generate_filter_params(filter_batch_max_limit): +def generate_filter_params(filter_batch_max_limit: int) -> List[Tuple[int, float]]: limits = range(1, filter_batch_max_limit + 1) thresholds = [i * 0.05 for i in range(20)] return list(itertools.product(limits, thresholds)) -def _get_completion_choices(param): +def _get_completion_choices( + param: Argument, +) -> Dict[str, Union[AnnifVocabulary, AnnifProject]]: if param.name == "project_id": return annif.registry.get_projects() elif param.name == "vocab_id": @@ -212,7 +249,7 @@ def _get_completion_choices(param): return [] -def complete_param(ctx, param, incomplete): +def complete_param(ctx: Context, param: Argument, incomplete: str) -> List[str]: with ctx.obj.load_app().app_context(): return [ choice diff --git a/annif/config.py b/annif/config.py index 589b337a3..5ff51b25e 100644 --- a/annif/config.py +++ b/annif/config.py @@ -1,9 +1,9 @@ """Configuration file handling""" +from __future__ import annotations - -import configparser import os.path from glob import glob +from typing import TYPE_CHECKING, Dict, List, Optional, Union import tomli @@ -11,13 +11,16 @@ import annif.util from annif.exception import ConfigurationException +if TYPE_CHECKING: + from configparser import SectionProxy + logger = annif.logger class AnnifConfigCFG: """Class for reading configuration in CFG/INI format""" - def __init__(self, filename): + def __init__(self, filename: str) -> None: self._config = configparser.ConfigParser() self._config.optionxform = annif.util.identity with open(filename, encoding="utf-8-sig") as projf: @@ -31,17 +34,17 @@ def __init__(self, filename): raise ConfigurationException(err) @property - def project_ids(self): + def project_ids(self) -> List[str]: return self._config.sections() - def __getitem__(self, key): + def __getitem__(self, key: str) -> configparser.SectionProxy: return self._config[key] class AnnifConfigTOML: """Class for reading configuration in TOML format""" - def __init__(self, filename): + def __init__(self, filename: str) -> None: with open(filename, "rb") as projf: try: logger.debug(f"Reading configuration file {filename} in TOML format") @@ -55,14 +58,14 @@ def __init__(self, filename): def project_ids(self): return self._config.keys() - def __getitem__(self, key): + def __getitem__(self, key: str) -> Dict[str, str]: return self._config[key] class AnnifConfigDirectory: """Class for reading configuration from directory""" - def __init__(self, directory): + def __init__(self, directory: str) -> None: files = glob(os.path.join(directory, "*.cfg")) 
files.extend(glob(os.path.join(directory, "*.toml"))) logger.debug(f"Reading configuration files in directory {directory}") @@ -74,7 +77,7 @@ def __init__(self, directory): self._check_duplicate_project_ids(proj_id, file) self._config[proj_id] = source_config[proj_id] - def _check_duplicate_project_ids(self, proj_id, file): + def _check_duplicate_project_ids(self, proj_id: str, file: str) -> None: if proj_id in self._config: # Error message resembles configparser's DuplicateSection message raise ConfigurationException( @@ -86,11 +89,11 @@ def _check_duplicate_project_ids(self, proj_id, file): def project_ids(self): return self._config.keys() - def __getitem__(self, key): + def __getitem__(self, key: str) -> Union[Dict[str, str], configparser.SectionProxy]: return self._config[key] -def check_config(projects_config_path): +def check_config(projects_config_path: str) -> Optional[str]: if os.path.exists(projects_config_path): return projects_config_path else: @@ -104,7 +107,7 @@ def check_config(projects_config_path): return None -def find_config(): +def find_config() -> Optional[str]: for path in ("projects.cfg", "projects.toml", "projects.d"): if os.path.exists(path): return path @@ -119,7 +122,9 @@ def find_config(): return None -def parse_config(projects_config_path): +def parse_config( + projects_config_path: str, +) -> Optional[Union[AnnifConfigDirectory, AnnifConfigCFG, AnnifConfigTOML]]: if projects_config_path: projects_config_path = check_config(projects_config_path) else: diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index 48fc83ff5..90bbf74d6 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -1,19 +1,25 @@ """Class for combining multiple corpora so they behave like a single corpus""" +from __future__ import annotations -import itertools +from typing import TYPE_CHECKING, List from .types import DocumentCorpus +if TYPE_CHECKING: + from itertools import chain + + from annif.corpus.document import DocumentFile + class CombinedCorpus(DocumentCorpus): """Class for combining multiple corpora so they behave like a single corpus""" - def __init__(self, corpora): + def __init__(self, corpora: List[DocumentFile]) -> None: self._corpora = corpora @property - def documents(self): + def documents(self) -> itertools.chain: return itertools.chain.from_iterable( [corpus.documents for corpus in self._corpora] ) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index f29eee32d..2d9ad6fc0 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -1,8 +1,20 @@ """Support for subjects loaded from a SKOS/RDF file""" +from __future__ import annotations import collections import os.path import shutil +from typing import ( + TYPE_CHECKING, + Any, + DefaultDict, + Dict, + Iterator, + List, + Set, + Tuple, + Union, +) import rdflib import rdflib.util @@ -12,8 +24,13 @@ from .types import Subject, SubjectCorpus +if TYPE_CHECKING: + from rdflib.term import URIRef -def serialize_subjects_to_skos(subjects, path): + from annif.corpus.types import Subject + + +def serialize_subjects_to_skos(subjects: Iterator[Any], path: str) -> None: """Create a SKOS representation of the given subjects and serialize it into a SKOS/Turtle file with the given path name.""" import joblib @@ -51,7 +68,7 @@ class SubjectFileSKOS(SubjectCorpus): _languages = None - def __init__(self, path): + def __init__(self, path: str) -> None: self.path = path if path.endswith(".dump.gz"): import joblib @@ -62,7 +79,7 @@ def __init__(self, path): self.graph.parse(self.path, 
format=rdflib.util.guess_format(self.path)) @property - def languages(self): + def languages(self) -> Set[str]: if self._languages is None: self._languages = { label.language @@ -73,7 +90,7 @@ def languages(self): } return self._languages - def _concept_labels(self, concept): + def _concept_labels(self, concept: URIRef) -> Dict[str, str]: by_lang = self.get_concept_labels(concept, self.PREF_LABEL_PROPERTIES) return { lang: by_lang[lang][0] @@ -85,7 +102,7 @@ def _concept_labels(self, concept): } @property - def subjects(self): + def subjects(self) -> Iterator[Subject]: for concept in self.concepts: labels = self._concept_labels(concept) @@ -96,13 +113,15 @@ def subjects(self): yield Subject(uri=str(concept), labels=labels, notation=notation) @property - def concepts(self): + def concepts(self) -> Iterator[URIRef]: for concept in self.graph.subjects(RDF.type, SKOS.Concept): if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: continue yield concept - def get_concept_labels(self, concept, label_types): + def get_concept_labels( + self, concept: URIRef, label_types: Union[Tuple[URIRef, URIRef], List[URIRef]] + ) -> Union[DefaultDict[str, List[str]], DefaultDict[None, List[str]]]: """return all the labels of the given concept with the given label properties as a dict-like object where the keys are language codes and the values are lists of labels in that language""" @@ -115,14 +134,14 @@ def get_concept_labels(self, concept, label_types): return labels_by_lang @staticmethod - def is_rdf_file(path): + def is_rdf_file(path: str) -> bool: """return True if the path looks like an RDF file that can be loaded as SKOS""" fmt = rdflib.util.guess_format(path) return fmt is not None - def save_skos(self, path): + def save_skos(self, path: str) -> None: """Save the contents of the subject vocabulary into a SKOS/Turtle file with the given path name.""" @@ -139,5 +158,5 @@ def save_skos(self, path): annif.util.atomic_save( self.graph, *os.path.split(path.replace(".ttl", ".dump.gz")), - method=joblib.dump + method=joblib.dump, ) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 06c33683b..cd8c08bdb 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -1,7 +1,9 @@ """Classes for supporting subject corpora expressed as directories or files""" +from __future__ import annotations import csv import os.path +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple, Union import annif import annif.util @@ -9,6 +11,12 @@ from .skos import serialize_subjects_to_skos from .types import Subject, SubjectCorpus +if TYPE_CHECKING: + from numpy import int32, ndarray + + from annif.corpus.skos import SubjectFileSKOS + from annif.corpus.types import Subject + logger = annif.logger.getChild("subject") logger.addFilter(annif.util.DuplicateFilter()) @@ -16,14 +24,14 @@ class SubjectFileTSV(SubjectCorpus): """A monolingual subject vocabulary stored in a TSV file.""" - def __init__(self, path, language): + def __init__(self, path: str, language: str) -> None: """initialize the SubjectFileTSV given a path to a TSV file and the language of the vocabulary""" self.path = path self.language = language - def _parse_line(self, line): + def _parse_line(self, line: str) -> Iterator[Subject]: vals = line.strip().split("\t", 2) clean_uri = annif.util.cleanup_uri(vals[0]) label = vals[1] if len(vals) >= 2 else None @@ -32,16 +40,16 @@ def _parse_line(self, line): yield Subject(uri=clean_uri, labels=labels, notation=notation) @property - def languages(self): + def 
languages(self) -> List[str]: return [self.language] @property - def subjects(self): + def subjects(self) -> None: with open(self.path, encoding="utf-8-sig") as subjfile: for line in subjfile: yield from self._parse_line(line) - def save_skos(self, path): + def save_skos(self, path: str) -> None: """Save the contents of the subject vocabulary into a SKOS/Turtle file with the given path name.""" serialize_subjects_to_skos(self.subjects, path) @@ -50,11 +58,11 @@ def save_skos(self, path): class SubjectFileCSV(SubjectCorpus): """A multilingual subject vocabulary stored in a CSV file.""" - def __init__(self, path): + def __init__(self, path: str) -> None: """initialize the SubjectFileCSV given a path to a CSV file""" self.path = path - def _parse_row(self, row): + def _parse_row(self, row: Dict[str, str]) -> Iterator[Subject]: labels = { fname.replace("label_", ""): value or None for fname, value in row.items() @@ -73,7 +81,7 @@ def _parse_row(self, row): ) @property - def languages(self): + def languages(self) -> List[str]: # infer the supported languages from the CSV column names with open(self.path, encoding="utf-8-sig") as csvfile: reader = csv.reader(csvfile) @@ -86,19 +94,19 @@ def languages(self): ] @property - def subjects(self): + def subjects(self) -> None: with open(self.path, encoding="utf-8-sig") as csvfile: reader = csv.DictReader(csvfile) for row in reader: yield from self._parse_row(row) - def save_skos(self, path): + def save_skos(self, path: str) -> None: """Save the contents of the subject vocabulary into a SKOS/Turtle file with the given path name.""" serialize_subjects_to_skos(self.subjects, path) @staticmethod - def is_csv_file(path): + def is_csv_file(path: str) -> bool: """return True if the path looks like a CSV file""" return os.path.splitext(path)[1].lower() == ".csv" @@ -108,30 +116,32 @@ class SubjectIndex: """An index that remembers the associations between integers subject IDs and their URIs and labels.""" - def __init__(self): + def __init__(self) -> None: self._subjects = [] self._uri_idx = {} self._label_idx = {} self._languages = None - def load_subjects(self, corpus): + def load_subjects( + self, corpus: Union[SubjectFileSKOS, SubjectFileCSV, SubjectFileTSV] + ) -> None: """Initialize the subject index from a subject corpus""" self._languages = corpus.languages for subject in corpus.subjects: self.append(subject) - def __len__(self): + def __len__(self) -> int: return len(self._subjects) @property - def languages(self): + def languages(self) -> List[str]: return self._languages - def __getitem__(self, subject_id): + def __getitem__(self, subject_id: Union[int, int32]) -> Subject: return self._subjects[subject_id] - def append(self, subject): + def append(self, subject: Subject) -> None: if self._languages is None and subject.labels is not None: self._languages = list(subject.labels.keys()) @@ -142,10 +152,10 @@ def append(self, subject): self._label_idx[(label, lang)] = subject_id self._subjects.append(subject) - def contains_uri(self, uri): + def contains_uri(self, uri: str) -> bool: return uri in self._uri_idx - def by_uri(self, uri, warnings=True): + def by_uri(self, uri: str, warnings: bool = True) -> Optional[int]: """return the subject ID of a subject by its URI, or None if not found. 
If warnings=True, log a warning message if the URI cannot be found.""" try: @@ -155,7 +165,7 @@ def by_uri(self, uri, warnings=True): logger.warning("Unknown subject URI <%s>", uri) return None - def by_label(self, label, language): + def by_label(self, label: Optional[str], language: str) -> Optional[int]: """return the subject ID of a subject by its label in a given language""" try: @@ -164,7 +174,7 @@ def by_label(self, label, language): logger.warning('Unknown subject label "%s"@%s', label, language) return None - def deprecated_ids(self): + def deprecated_ids(self) -> List[Union[Any, int]]: """return indices of deprecated subjects""" return [ @@ -174,7 +184,7 @@ def deprecated_ids(self): ] @property - def active(self): + def active(self) -> List[Tuple[int, Subject]]: """return a list of (subject_id, subject) tuples of all subjects that are not deprecated""" @@ -184,7 +194,7 @@ def active(self): if subject.labels is not None ] - def save(self, path): + def save(self, path: str) -> None: """Save this subject index into a file with the given path name.""" fieldnames = ["uri", "notation"] + [f"label_{lang}" for lang in self._languages] @@ -200,7 +210,7 @@ def save(self, path): writer.writerow(row) @classmethod - def load(cls, path): + def load(cls, path: str) -> "SubjectIndex": """Load a subject index from a CSV file and return it.""" corpus = SubjectFileCSV(path) @@ -212,7 +222,7 @@ def load(cls, path): class SubjectSet: """Represents a set of subjects for a document.""" - def __init__(self, subject_ids=None): + def __init__(self, subject_ids: Optional[Any] = None) -> None: """Create a SubjectSet and optionally initialize it from an iterable of subject IDs""" @@ -224,23 +234,25 @@ def __init__(self, subject_ids=None): else: self._subject_ids = [] - def __len__(self): + def __len__(self) -> int: return len(self._subject_ids) - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> int: return self._subject_ids[idx] - def __bool__(self): + def __bool__(self) -> bool: return bool(self._subject_ids) - def __eq__(self, other): + def __eq__(self, other: Union[SubjectSet, List[int], Set[int]]) -> bool: if isinstance(other, SubjectSet): return self._subject_ids == other._subject_ids return False @classmethod - def from_string(cls, subj_data, subject_index, language): + def from_string( + cls, subj_data: str, subject_index: SubjectIndex, language: str + ) -> "SubjectSet": subject_ids = set() for line in subj_data.splitlines(): uri, label = cls._parse_line(line) @@ -251,7 +263,9 @@ def from_string(cls, subj_data, subject_index, language): return cls(subject_ids) @staticmethod - def _parse_line(line): + def _parse_line( + line: str, + ) -> Union[Tuple[None, None], Tuple[str, str], Tuple[None, str]]: uri = label = None vals = line.split("\t") for val in vals: @@ -265,14 +279,14 @@ def _parse_line(line): break return uri, label - def as_vector(self, size=None, destination=None): + def as_vector( + self, size: Optional[int] = None, destination: Optional[ndarray] = None + ) -> ndarray: """Return the hits as a one-dimensional NumPy array in sklearn multilabel indicator format. 
Use destination array if given (not None), otherwise create and return a new one of the given size.""" if destination is None: - import numpy as np - assert size is not None and size > 0 destination = np.zeros(size, dtype=bool) diff --git a/annif/corpus/types.py b/annif/corpus/types.py index fb607fdc7..3a4a7e02a 100644 --- a/annif/corpus/types.py +++ b/annif/corpus/types.py @@ -1,8 +1,10 @@ """Basic types for document and subject corpora""" +from __future__ import annotations import abc import collections from itertools import islice +from typing import TYPE_CHECKING, Iterator, List Document = collections.namedtuple("Document", "text subject_set") @@ -19,7 +21,7 @@ def documents(self): pass # pragma: no cover @property - def doc_batches(self): + def doc_batches(self) -> Iterator[List[Document]]: """Iterate through the document corpus in batches, yielding lists of Document objects.""" it = iter(self.documents) @@ -29,7 +31,7 @@ def doc_batches(self): return yield docs_batch - def is_empty(self): + def is_empty(self) -> bool: """Check if there are no documents to iterate.""" try: next(self.documents) diff --git a/annif/datadir.py b/annif/datadir.py index 314f685b1..84ea61fe9 100644 --- a/annif/datadir.py +++ b/annif/datadir.py @@ -1,17 +1,19 @@ """Mixin class for types that need a data directory""" +from __future__ import annotations import os import os.path +from typing import TYPE_CHECKING class DatadirMixin: """Mixin class for types that need a data directory for storing files""" - def __init__(self, datadir, typename, identifier): + def __init__(self, datadir: str, typename: str, identifier: str) -> None: self._datadir_path = os.path.join(datadir, typename, identifier) @property - def datadir(self): + def datadir(self) -> str: if not os.path.exists(self._datadir_path): try: os.makedirs(self._datadir_path) diff --git a/annif/eval.py b/annif/eval.py index 264bcad43..aa7ba8870 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -1,34 +1,47 @@ """Evaluation metrics for Annif""" +from __future__ import annotations import warnings +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union -import numpy as np import scipy.sparse from sklearn.metrics import f1_score, precision_score, recall_score from annif.exception import NotSupportedException from annif.suggestion import SuggestionBatch, filter_suggestion +if TYPE_CHECKING: + from io import TextIOWrapper -def true_positives(y_true, y_pred): + from click.utils import LazyFile + from numpy import float64 + from scipy.sparse._arrays import csr_array + + from annif.corpus.subject import SubjectIndex, SubjectSet + from annif.suggestion import SubjectSuggestion + + +def true_positives(y_true: csr_array, y_pred: csr_array) -> int: """calculate the number of true positives using bitwise operations, emulating the way sklearn evaluation metric functions work""" return int((y_true.multiply(y_pred)).sum()) -def false_positives(y_true, y_pred): +def false_positives(y_true: csr_array, y_pred: csr_array) -> int: """calculate the number of false positives using bitwise operations, emulating the way sklearn evaluation metric functions work""" return int((y_true < y_pred).sum()) -def false_negatives(y_true, y_pred): +def false_negatives(y_true: csr_array, y_pred: csr_array) -> int: """calculate the number of false negatives using bitwise operations, emulating the way sklearn evaluation metric functions work""" return int((y_true > y_pred).sum()) -def dcg_score(y_true, y_pred, limit=None): +def dcg_score( + y_true: csr_array, y_pred: 
csr_array, limit: Optional[int] = None +) -> float64: """return the discounted cumulative gain (DCG) score for the selected labels vs. relevant labels""" @@ -43,7 +56,9 @@ def dcg_score(y_true, y_pred, limit=None): return (gain / discount).sum() -def ndcg_score(y_true, y_pred, limit=None): +def ndcg_score( + y_true: csr_array, y_pred: csr_array, limit: Optional[int] = None +) -> float: """return the normalized discounted cumulative gain (nDCG) score for the selected labels vs. relevant labels""" @@ -65,12 +80,57 @@ class EvaluationBatch: for a list of documents of the batch. Final results can be queried using the results() method.""" - def __init__(self, subject_index): + def __init__(self, subject_index: SubjectIndex) -> None: self._subject_index = subject_index self._suggestion_arrays = [] self._gold_subject_arrays = [] - def evaluate_many(self, suggestion_batch, gold_subject_batch): + def evaluate_many( + self, + suggestion_batch: Union[ + List[List[SubjectSuggestion]], SuggestionBatch, List[Iterator[Any]] + ], + gold_subject_batch: Union[ + Tuple[SubjectSet, SubjectSet, SubjectSet], + Tuple[SubjectSet, SubjectSet, SubjectSet, SubjectSet], + Tuple[SubjectSet, SubjectSet], + Tuple[ + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + SubjectSet, + ], + List[SubjectSet], + ], + ) -> None: if not isinstance(suggestion_batch, SuggestionBatch): suggestion_batch = SuggestionBatch.from_sequence( suggestion_batch, self._subject_index @@ -86,7 +146,12 @@ def evaluate_many(self, suggestion_batch, gold_subject_batch): ar[idx, subject_id] = True self._gold_subject_arrays.append(ar.tocsr()) - def _evaluate_samples(self, y_true, y_pred, metrics=[]): + def _evaluate_samples( + self, + y_true: csr_array, + y_pred: csr_array, + metrics: Union[Tuple[str, str], Tuple[()], List[str]] = [], + ) -> Dict[str, Union[float64, float, int]]: y_pred_binary = y_pred > 0.0 # define the available metrics as lazy lambda functions @@ -156,7 +221,9 @@ def _evaluate_samples(self, y_true, y_pred, metrics=[]): return {metric: all_metrics[metric]() for metric in metrics} - def _result_per_subject_header(self, results_file): + def _result_per_subject_header( + self, results_file: Union[LazyFile, TextIOWrapper] + ) -> None: print( "\t".join( [ @@ -174,11 +241,19 @@ def _result_per_subject_header(self, results_file): file=results_file, ) - def _result_per_subject_body(self, zipped_results, results_file): + def _result_per_subject_body( + self, zipped_results: zip, results_file: Union[LazyFile, TextIOWrapper] + ) -> None: for row in zipped_results: print("\t".join((str(e) for e in row)), file=results_file) - def output_result_per_subject(self, y_true, y_pred, results_file, language): + def output_result_per_subject( + self, + y_true: csr_array, + y_pred: csr_array, + results_file: Union[TextIOWrapper, LazyFile], + language: str, + ) -> None: """Write results per subject (non-aggregated) to outputfile results_file, using labels in the given language""" @@ -208,7 +283,12 @@ def output_result_per_subject(self, y_true, y_pred, results_file, language): self._result_per_subject_header(results_file) self._result_per_subject_body(zipped, results_file) - def 
results(self, metrics=[], results_file=None, language=None): + def results( + self, + metrics: Union[Tuple[str, str], Tuple[()], List[str]] = [], + results_file: Optional[Union[LazyFile, TextIOWrapper]] = None, + language: Optional[str] = None, + ) -> Dict[str, Union[float64, float, int]]: """evaluate a set of selected subjects against a gold standard using different metrics. If metrics is empty, use all available metrics. If results_file (file object) given, write results per subject to it diff --git a/annif/exception.py b/annif/exception.py index efc2d4a3e..8fd5a06dc 100644 --- a/annif/exception.py +++ b/annif/exception.py @@ -1,15 +1,26 @@ """Custom exceptions used by Annif""" +from __future__ import annotations +from typing import TYPE_CHECKING, Optional, Union from click import ClickException +if TYPE_CHECKING: + from configparser import DuplicateSectionError + from unittest.mock import Mock + class AnnifException(ClickException): """Base Annif exception. We define this as a subclass of ClickException so that the CLI can automatically handle exceptions. This exception cannot be instantiated directly - subclasses should be used instead.""" - def __init__(self, message, project_id=None, backend_id=None): + def __init__( + self, + message: Union[DuplicateSectionError, str], + project_id: Optional[Union[Mock, str]] = None, + backend_id: Optional[str] = None, + ) -> None: super().__init__(message) self.project_id = project_id self.backend_id = backend_id @@ -20,7 +31,7 @@ def __init__(self, message, project_id=None, backend_id=None): # subclasses should set this to a descriptive prefix prefix = None - def format_message(self): + def format_message(self) -> str: if self.project_id is not None: return "{} project '{}': {}".format( self.prefix, self.project_id, self.message diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 8c9b59f79..40af32774 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -1,12 +1,13 @@ """MLLM (Maui-like Lexical Matchin) model for Annif""" +from __future__ import annotations import collections import math from enum import IntEnum from statistics import mean +from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Tuple, Union import joblib -import numpy as np from rdflib.namespace import SKOS from sklearn.ensemble import BaggingClassifier from sklearn.feature_extraction.text import CountVectorizer @@ -22,6 +23,16 @@ make_relation_matrix, ) +if TYPE_CHECKING: + from numpy import float64, ndarray + from rdflib.graph import Graph + from rdflib.term import URIRef + from sklearn.ensemble._bagging import BaggingClassifier + + from annif.analyzer.snowball import SnowballAnalyzer + from annif.corpus.document import DocumentDirectory + from annif.vocab import AnnifVocabulary + Term = collections.namedtuple("Term", "subject_id label is_pref") Match = collections.namedtuple("Match", "subject_id is_pref n_tokens pos ambiguity") @@ -45,7 +56,9 @@ ) -def conflate_matches(matches, doc_length): +def conflate_matches( + matches: List[Union[Any, Match]], doc_length: int +) -> List[Union[Candidate, Any]]: subj_matches = collections.defaultdict(list) for match in matches: subj_matches[match.subject_id].append(match) @@ -65,7 +78,12 @@ def conflate_matches(matches, doc_length): ] -def generate_candidates(text, analyzer, vectorizer, index): +def generate_candidates( + text: str, + analyzer: SnowballAnalyzer, + vectorizer: CountVectorizer, + index: TokenSetIndex, +) -> List[Union[Candidate, Any]]: sentences = analyzer.tokenize_sentences(text) sent_tokens 
= vectorizer.transform(sentences) matches = [] @@ -86,7 +104,7 @@ def generate_candidates(text, analyzer, vectorizer, index): return conflate_matches(matches, len(sentences)) -def candidates_to_features(candidates, mdata): +def candidates_to_features(candidates: List[Candidate], mdata: "ModelData") -> ndarray: """Convert a list of Candidates to a NumPy feature matrix""" matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32) @@ -133,11 +151,13 @@ def candidates_to_features(cls, candidates): class MLLMModel: """Maui-like Lexical Matching model""" - def generate_candidates(self, text, analyzer): + def generate_candidates( + self, text: str, analyzer: SnowballAnalyzer + ) -> List[Union[Candidate, Any]]: return generate_candidates(text, analyzer, self._vectorizer, self._index) @property - def _model_data(self): + def _model_data(self) -> "ModelData": return ModelData( broader=self._broader_matrix, narrower=self._narrower_matrix, @@ -148,11 +168,13 @@ def _model_data(self): idf=self._idf, ) - def _candidates_to_features(self, candidates): + def _candidates_to_features(self, candidates: List[Candidate]) -> ndarray: return candidates_to_features(candidates, self._model_data) @staticmethod - def _get_label_props(params): + def _get_label_props( + params: Dict[str, Union[int, float, bool, str]] + ) -> Tuple[List[URIRef], List[URIRef]]: pref_label_props = [SKOS.prefLabel] if annif.util.boolean(params["use_hidden_labels"]): @@ -162,7 +184,12 @@ def _get_label_props(params): return (pref_label_props, nonpref_label_props) - def _prepare_terms(self, graph, vocab, params): + def _prepare_terms( + self, + graph: Graph, + vocab: AnnifVocabulary, + params: Dict[str, Union[int, float, bool, str]], + ) -> Tuple[List[Term], List[int]]: pref_label_props, nonpref_label_props = self._get_label_props(params) terms = [] @@ -182,13 +209,18 @@ def _prepare_terms(self, graph, vocab, params): return (terms, subject_ids) - def _prepare_relations(self, graph, vocab): + def _prepare_relations(self, graph: Graph, vocab: AnnifVocabulary) -> None: self._broader_matrix = make_relation_matrix(graph, vocab, SKOS.broader) self._narrower_matrix = make_relation_matrix(graph, vocab, SKOS.narrower) self._related_matrix = make_relation_matrix(graph, vocab, SKOS.related) self._collection_matrix = make_collection_matrix(graph, vocab) - def _prepare_train_index(self, vocab, analyzer, params): + def _prepare_train_index( + self, + vocab: AnnifVocabulary, + analyzer: SnowballAnalyzer, + params: Dict[str, Union[int, float, bool, str]], + ) -> List[int]: graph = vocab.as_graph() terms, subject_ids = self._prepare_terms(graph, vocab, params) self._prepare_relations(graph, vocab) @@ -211,7 +243,9 @@ def _prepare_train_index(self, vocab, analyzer, params): return subject_ids - def _prepare_train_data(self, corpus, analyzer, n_jobs): + def _prepare_train_data( + self, corpus: DocumentDirectory, analyzer: SnowballAnalyzer, n_jobs: int + ) -> Tuple[List[List[Union[Candidate, Any]]], List[bool]]: # frequency of subjects (by id) in the generated candidates self._doc_freq = collections.Counter() # frequency of manually assigned subjects ("domain keyphraseness") @@ -241,14 +275,18 @@ def _prepare_train_data(self, corpus, analyzer, n_jobs): return (train_x, train_y) - def _calculate_idf(self, subject_ids, doc_count): + def _calculate_idf( + self, subject_ids: List[int], doc_count: int + ) -> DefaultDict[int, float]: idf = collections.defaultdict(float) for subj_id in subject_ids: idf[subj_id] = math.log((doc_count + 1) / 
(self._doc_freq[subj_id] + 1)) + 1 return idf - def _prepare_features(self, train_x, n_jobs): + def _prepare_features( + self, train_x: List[List[Union[Candidate, Any]]], n_jobs: int + ) -> List[ndarray]: fc_args = {"mdata": self._model_data} jobs, pool_class = annif.parallel.get_pool(n_jobs) @@ -261,7 +299,14 @@ def _prepare_features(self, train_x, n_jobs): return features - def prepare_train(self, corpus, vocab, analyzer, params, n_jobs): + def prepare_train( + self, + corpus: DocumentDirectory, + vocab: AnnifVocabulary, + analyzer: SnowballAnalyzer, + params: Dict[str, Union[int, float, bool, str]], + n_jobs: int, + ) -> Tuple[ndarray, ndarray]: # create an index from the vocabulary terms subject_ids = self._prepare_train_index(vocab, analyzer, params) @@ -276,7 +321,9 @@ def prepare_train(self, corpus, vocab, analyzer, params, n_jobs): return (np.vstack(features), np.array(train_y)) - def _create_classifier(self, params): + def _create_classifier( + self, params: Dict[str, Union[int, float, bool, str]] + ) -> sklearn.ensemble._bagging.BaggingClassifier: return BaggingClassifier( DecisionTreeClassifier( min_samples_leaf=int(params["min_samples_leaf"]), @@ -285,7 +332,12 @@ def _create_classifier(self, params): max_samples=float(params["max_samples"]), ) - def train(self, train_x, train_y, params): + def train( + self, + train_x: Union[ndarray, List[Tuple[int, int]]], + train_y: Union[List[bool], ndarray], + params: Dict[str, Union[int, float, bool, str]], + ) -> None: # fit the model on the training corpus self._classifier = self._create_classifier(params) self._classifier.fit(train_x, train_y) @@ -298,20 +350,24 @@ def train(self, train_x, train_y, params): + "data matches your vocabulary." ) - def _prediction_to_list(self, scores, candidates): + def _prediction_to_list( + self, scores: ndarray, candidates: List[Candidate] + ) -> List[Tuple[float64, int]]: subj_scores = [(score[1], c.subject_id) for score, c in zip(scores, candidates)] return sorted(subj_scores, reverse=True) - def predict(self, candidates): + def predict( + self, candidates: List[Union[Candidate, Any]] + ) -> List[Union[Any, Tuple[float64, int]]]: if not candidates: return [] features = self._candidates_to_features(candidates) scores = self._classifier.predict_proba(features) return self._prediction_to_list(scores, candidates) - def save(self, filename): + def save(self, filename: str) -> List[str]: return joblib.dump(self, filename) @staticmethod - def load(filename): + def load(filename: str) -> "MLLMModel": return joblib.load(filename) diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py index ebd23e33f..2c5895b26 100644 --- a/annif/lexical/tokenset.py +++ b/annif/lexical/tokenset.py @@ -1,6 +1,11 @@ """Index for fast matching of token sets.""" +from __future__ import annotations import collections +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +if TYPE_CHECKING: + from numpy import int32, ndarray class TokenSet: @@ -8,19 +13,24 @@ class TokenSet: be matched with another set of tokens. 
A TokenSet can optionally be associated with a subject from the vocabulary.""" - def __init__(self, tokens, subject_id=None, is_pref=False): + def __init__( + self, + tokens: Union[List[int32], List[int], ndarray], + subject_id: Optional[int] = None, + is_pref: bool = False, + ) -> None: self._tokens = set(tokens) self.key = tokens[0] if len(tokens) else None self.subject_id = subject_id self.is_pref = is_pref - def __len__(self): + def __len__(self) -> int: return len(self._tokens) def __iter__(self): return iter(self._tokens) - def contains(self, other): + def contains(self, other: "TokenSet") -> bool: """Returns True iff the tokens in the other TokenSet are all included within this TokenSet.""" @@ -30,18 +40,20 @@ def contains(self, other): class TokenSetIndex: """A searchable index of TokenSets (representing vocabulary terms)""" - def __init__(self): + def __init__(self) -> None: self._index = collections.defaultdict(set) - def __len__(self): + def __len__(self) -> int: return len(self._index) - def add(self, tset): + def add(self, tset: TokenSet) -> None: """Add a TokenSet into this index""" if tset.key is not None: self._index[tset.key].add(tset) - def _find_subj_tsets(self, tset): + def _find_subj_tsets( + self, tset: TokenSet + ) -> Union[Dict[Optional[int], TokenSet], Dict[int, TokenSet]]: """return a dict (subject_id : TokenSet) of matches contained in the given TokenSet""" @@ -75,7 +87,7 @@ def _find_subj_ambiguity(self, tsets): return subj_ambiguity - def search(self, tset): + def search(self, tset: TokenSet) -> List[Union[Any, Tuple[TokenSet, int]]]: """Return the TokenSets that are contained in the given TokenSet. The matches are returned as a list of (TokenSet, ambiguity) pairs where ambiguity is an integer indicating the number of other TokenSets diff --git a/annif/lexical/util.py b/annif/lexical/util.py index a6d9931c7..abd0c91e4 100644 --- a/annif/lexical/util.py +++ b/annif/lexical/util.py @@ -1,13 +1,24 @@ """Utility methods for lexical algorithms""" +from __future__ import annotations import collections +from typing import TYPE_CHECKING, Any, List, Union from rdflib import URIRef from rdflib.namespace import SKOS from scipy.sparse import csc_matrix, lil_matrix +if TYPE_CHECKING: + from rdflib.graph import Graph + from rdflib.term import URIRef + from scipy.sparse._csc import csc_matrix -def get_subject_labels(graph, uri, properties, language): + from annif.vocab import AnnifVocabulary + + +def get_subject_labels( + graph: Graph, uri: str, properties: List[rdflib.term.URIRef], language: str +) -> List[Union[Any, str]]: return [ str(label) for prop in properties @@ -16,7 +27,9 @@ def get_subject_labels(graph, uri, properties, language): ] -def make_relation_matrix(graph, vocab, property): +def make_relation_matrix( + graph: Graph, vocab: AnnifVocabulary, property: rdflib.term.URIRef +) -> scipy.sparse._csc.csc_matrix: n_subj = len(vocab.subjects) matrix = lil_matrix((n_subj, n_subj), dtype=bool) @@ -29,7 +42,9 @@ def make_relation_matrix(graph, vocab, property): return csc_matrix(matrix) -def make_collection_matrix(graph, vocab): +def make_collection_matrix( + graph: Graph, vocab: AnnifVocabulary +) -> scipy.sparse._csc.csc_matrix: # make an index with all collection members c_members = collections.defaultdict(list) for coll, member in graph.subject_objects(SKOS.member): diff --git a/annif/openapi/validation.py b/annif/openapi/validation.py index 3799a6126..77d732e64 100644 --- a/annif/openapi/validation.py +++ b/annif/openapi/validation.py @@ -1,6 +1,8 @@ """Custom 
validator for the Annif API.""" +from __future__ import annotations import logging +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import jsonschema from connexion import decorators @@ -14,10 +16,20 @@ class CustomRequestBodyValidator(decorators.validation.RequestBodyValidator): """Custom request body validator that overrides the default error message for the 'maxItems' validator for the 'documents' property.""" - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - def validate_schema(self, data, url): + def validate_schema( + self, + data: Union[ + List[Dict[str, Union[List[Dict[str, str]], str]]], + List[Dict[str, Optional[List[bool]]]], + Dict[str, List[Any]], + Dict[str, str], + Dict[str, List[Dict[str, str]]], + ], + url: str, + ) -> None: """Validate the request body against the schema.""" if self.is_null_value_valid and is_null(data): diff --git a/annif/project.py b/annif/project.py index b94eaf58e..75345dee2 100644 --- a/annif/project.py +++ b/annif/project.py @@ -1,8 +1,10 @@ """Project management functionality for Annif""" +from __future__ import annotations import enum import os.path from shutil import rmtree +from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Union import annif import annif.analyzer @@ -17,6 +19,32 @@ NotSupportedException, ) +if TYPE_CHECKING: + from configparser import SectionProxy + from datetime import datetime + + from click.utils import LazyFile + + from annif.analyzer.snowball import SnowballAnalyzer + from annif.backend.dummy import DummyBackend + from annif.backend.ensemble import EnsembleBackend + from annif.backend.fasttext import FastTextBackend + from annif.backend.hyperopt import HPRecommendation + from annif.backend.pav import PAVBackend + from annif.backend.tfidf import TFIDFBackend + from annif.corpus.combine import CombinedCorpus + from annif.corpus.document import ( + DocumentDirectory, + DocumentFile, + DocumentList, + LimitingDocumentCorpus, + ) + from annif.corpus.subject import SubjectIndex + from annif.registry import AnnifRegistry + from annif.suggestion import SuggestionBatch, SuggestionResults + from annif.transform.transform import TransformChain + from annif.vocab import AnnifVocabulary + logger = annif.logger @@ -42,7 +70,13 @@ class AnnifProject(DatadirMixin): # default values for configuration settings DEFAULT_ACCESS = "public" - def __init__(self, project_id, config, datadir, registry): + def __init__( + self, + project_id: str, + config: Union[Dict[str, str], SectionProxy], + datadir: str, + registry: AnnifRegistry, + ) -> None: DatadirMixin.__init__(self, datadir, "projects", project_id) self.project_id = project_id self.name = config.get("name", project_id) @@ -55,7 +89,7 @@ def __init__(self, project_id, config, datadir, registry): self.registry = registry self._init_access() - def _init_access(self): + def _init_access(self) -> None: access = self.config.get("access", self.DEFAULT_ACCESS) try: self.access = getattr(Access, access) @@ -65,7 +99,7 @@ def _init_access(self): project_id=self.project_id, ) - def _initialize_analyzer(self): + def _initialize_analyzer(self) -> None: if not self.analyzer_spec: return # not configured, so assume it's not needed analyzer = self.analyzer @@ -73,7 +107,7 @@ def _initialize_analyzer(self): "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer) ) - def _initialize_subjects(self): + def _initialize_subjects(self) -> None: try: subjects = self.subjects logger.debug( 
@@ -82,7 +116,7 @@ def _initialize_subjects(self): except AnnifException as err: logger.warning(err.format_message()) - def _initialize_backend(self, parallel): + def _initialize_backend(self, parallel: bool) -> None: logger.debug("Project '%s': initializing backend", self.project_id) try: if not self.backend: @@ -92,7 +126,7 @@ def _initialize_backend(self, parallel): except AnnifException as err: logger.warning(err.format_message()) - def initialize(self, parallel=False): + def initialize(self, parallel: bool = False) -> None: """Initialize this project and its backend so that they are ready to be used. If parallel is True, expect that the project will be used for parallel processing.""" @@ -108,14 +142,18 @@ def initialize(self, parallel=False): self.initialized = True - def _suggest_with_backend(self, texts, backend_params): + def _suggest_with_backend( + self, + texts: List[str], + backend_params: Optional[DefaultDict[str, Dict[str, str]]], + ) -> annif.suggestion.SuggestionBatch: if backend_params is None: backend_params = {} beparams = backend_params.get(self.backend.backend_id, {}) return self.backend.suggest(texts, beparams) @property - def analyzer(self): + def analyzer(self) -> SnowballAnalyzer: if self._analyzer is None: if self.analyzer_spec: self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec) @@ -126,7 +164,7 @@ def analyzer(self): return self._analyzer @property - def transform(self): + def transform(self) -> TransformChain: if self._transform is None: self._transform = annif.transform.get_transform( self.transform_spec, project=self @@ -134,7 +172,11 @@ def transform(self): return self._transform @property - def backend(self): + def backend( + self, + ) -> Union[ + DummyBackend, EnsembleBackend, PAVBackend, TFIDFBackend, FastTextBackend + ]: if self._backend is None: if "backend" not in self.config: raise ConfigurationException( @@ -154,7 +196,7 @@ def backend(self): ) return self._backend - def _initialize_vocab(self): + def _initialize_vocab(self) -> None: if self.vocab_spec is None: raise ConfigurationException( "vocab setting is missing", project_id=self.project_id @@ -164,22 +206,22 @@ def _initialize_vocab(self): ) @property - def vocab(self): + def vocab(self) -> AnnifVocabulary: if self._vocab is None: self._initialize_vocab() return self._vocab @property - def vocab_lang(self): + def vocab_lang(self) -> str: if self._vocab_lang is None: self._initialize_vocab() return self._vocab_lang @property - def subjects(self): + def subjects(self) -> SubjectIndex: return self.vocab.subjects - def _get_info(self, key): + def _get_info(self, key: str) -> Optional[Union[bool, datetime]]: try: be = self.backend if be is not None: @@ -189,24 +231,31 @@ def _get_info(self, key): return None @property - def is_trained(self): + def is_trained(self) -> Optional[bool]: return self._get_info("is_trained") @property - def modification_time(self): + def modification_time(self) -> Optional[datetime]: return self._get_info("modification_time") - def suggest_corpus(self, corpus, backend_params=None): + def suggest_corpus( + self, + corpus: Union[DocumentDirectory, DocumentList], + backend_params: None = None, + ) -> annif.suggestion.SuggestionResults: """Suggest subjects for the given documents corpus in batches of documents.""" suggestions = ( self.suggest([doc.text for doc in doc_batch], backend_params) for doc_batch in corpus.doc_batches ) - import annif.suggestion return annif.suggestion.SuggestionResults(suggestions) - def suggest(self, texts, backend_params=None): + def 
suggest( + self, + texts: List[str], + backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, + ) -> annif.suggestion.SuggestionBatch: """Suggest subjects for the given documents batch.""" if not self.is_trained: if self.is_trained is None: @@ -216,7 +265,12 @@ def suggest(self, texts, backend_params=None): texts = [self.transform.transform_text(text) for text in texts] return self._suggest_with_backend(texts, backend_params) - def train(self, corpus, backend_params=None, jobs=0): + def train( + self, + corpus: Union[CombinedCorpus, LimitingDocumentCorpus, DocumentFile, str], + backend_params: None = None, + jobs: int = 0, + ) -> None: """train the project using documents from a metadata source""" if corpus != "cached": corpus = self.transform.transform_corpus(corpus) @@ -225,7 +279,11 @@ def train(self, corpus, backend_params=None, jobs=0): beparams = backend_params.get(self.backend.backend_id, {}) self.backend.train(corpus, beparams, jobs) - def learn(self, corpus, backend_params=None): + def learn( + self, + corpus: Union[DocumentDirectory, DocumentFile, DocumentList], + backend_params: None = None, + ) -> None: """further train the project using documents from a metadata source""" if backend_params is None: backend_params = {} @@ -238,7 +296,14 @@ def learn(self, corpus, backend_params=None): "Learning not supported by backend", project_id=self.project_id ) - def hyperopt(self, corpus, trials, jobs, metric, results_file): + def hyperopt( + self, + corpus: DocumentDirectory, + trials: int, + jobs: int, + metric: str, + results_file: Optional[LazyFile], + ) -> HPRecommendation: """optimize the hyperparameters of the project using a validation corpus against a given metric""" if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend): @@ -250,7 +315,7 @@ def hyperopt(self, corpus, trials, jobs, metric, results_file): project_id=self.project_id, ) - def dump(self): + def dump(self) -> Dict[str, Optional[Union[str, Dict[str, str], bool, datetime]]]: """return this project as a dict""" return { "project_id": self.project_id, @@ -261,7 +326,7 @@ def dump(self): "modification_time": self.modification_time, } - def remove_model_data(self): + def remove_model_data(self) -> None: """remove the data of this project""" datadir_path = self._datadir_path if os.path.isdir(datadir_path): diff --git a/annif/registry.py b/annif/registry.py index e0368b1e3..6d8db92dc 100644 --- a/annif/registry.py +++ b/annif/registry.py @@ -1,7 +1,8 @@ """Registry that keeps track of Annif projects""" +from __future__ import annotations -import collections import re +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from flask import current_app @@ -12,6 +13,11 @@ from annif.util import parse_args from annif.vocab import AnnifVocabulary +if TYPE_CHECKING: + from collections import OrderedDict + + from werkzeug.local import LocalProxy + logger = annif.logger @@ -28,7 +34,9 @@ class AnnifRegistry: _projects = {} _vocabs = {} - def __init__(self, projects_config_path, datadir, init_projects): + def __init__( + self, projects_config_path: str, datadir: str, init_projects: bool + ) -> None: self._rid = id(self) self._projects_config_path = projects_config_path self._datadir = datadir @@ -37,13 +45,13 @@ def __init__(self, projects_config_path, datadir, init_projects): for project in self._projects[self._rid].values(): project.initialize() - def _init_vars(self): + def _init_vars(self) -> None: # initialize the static variables, if necessary if self._rid not in self._projects: 
self._projects[self._rid] = self._create_projects() self._vocabs[self._rid] = {} - def _create_projects(self): + def _create_projects(self) -> collections.OrderedDict: # parse the configuration config = parse_config(self._projects_config_path) @@ -59,7 +67,9 @@ def _create_projects(self): ) return projects - def get_projects(self, min_access=Access.private): + def get_projects( + self, min_access: Access = Access.private + ) -> Dict[str, AnnifProject]: """Return the available projects as a dict of project_id -> AnnifProject. The min_access parameter may be used to set the minimum access level required for the returned projects.""" @@ -71,7 +81,9 @@ def get_projects(self, min_access=Access.private): if project.access >= min_access } - def get_project(self, project_id, min_access=Access.private): + def get_project( + self, project_id: str, min_access: Access = Access.private + ) -> AnnifProject: """return the definition of a single Project by project_id""" projects = self.get_projects(min_access) @@ -80,7 +92,9 @@ def get_project(self, project_id, min_access=Access.private): except KeyError: raise ValueError("No such project {}".format(project_id)) - def get_vocab(self, vocab_spec, default_language): + def get_vocab( + self, vocab_spec: str, default_language: Optional[str] + ) -> Union[Tuple[AnnifVocabulary, None], Tuple[AnnifVocabulary, str]]: """Return an (AnnifVocabulary, language) pair corresponding to the vocab_spec. If no language information is specified, use the given default language.""" @@ -101,14 +115,14 @@ def get_vocab(self, vocab_spec, default_language): return self._vocabs[self._rid][vocab_key], language -def initialize_projects(app): +def initialize_projects(app: LocalProxy) -> None: projects_config_path = app.config["PROJECTS_CONFIG_PATH"] datadir = app.config["DATADIR"] init_projects = app.config["INITIALIZE_PROJECTS"] app.annif_registry = AnnifRegistry(projects_config_path, datadir, init_projects) -def get_projects(min_access=Access.private): +def get_projects(min_access: Access = Access.private) -> Dict[str, AnnifProject]: """Return the available projects as a dict of project_id -> AnnifProject. The min_access parameter may be used to set the minimum access level required for the returned projects.""" @@ -118,7 +132,7 @@ def get_projects(min_access=Access.private): return current_app.annif_registry.get_projects(min_access) -def get_project(project_id, min_access=Access.private): +def get_project(project_id: str, min_access: Access = Access.private) -> AnnifProject: """return the definition of a single Project by project_id""" projects = get_projects(min_access) @@ -128,7 +142,7 @@ def get_project(project_id, min_access=Access.private): raise ValueError(f"No such project '{project_id}'") -def get_vocabs(min_access=Access.private): +def get_vocabs(min_access: Access = Access.private) -> Dict[str, AnnifVocabulary]: """Return the available vocabularies as a dict of vocab_id -> AnnifVocabulary. The min_access parameter may be used to set the minimum access level required for the returned vocabularies.""" @@ -143,7 +157,7 @@ def get_vocabs(min_access=Access.private): return vocabs -def get_vocab(vocab_id, min_access=Access.private): +def get_vocab(vocab_id: str, min_access: Access = Access.private) -> AnnifVocabulary: """return a single AnnifVocabulary by vocabulary id""" vocabs = get_vocabs(min_access) diff --git a/annif/rest.py b/annif/rest.py index 0b3b87efe..4101d856d 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -1,7 +1,9 @@ """Definitions for REST API operations. 
These are wired via Connexion to methods defined in the OpenAPI specification.""" +from __future__ import annotations import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import connexion @@ -10,8 +12,18 @@ from annif.exception import AnnifException from annif.project import Access +if TYPE_CHECKING: + from datetime import datetime -def project_not_found_error(project_id): + from connexion.lifecycle import ConnexionResponse + + from annif.corpus.document import DocumentList + from annif.corpus.subject import SubjectIndex + from annif.exception import ConfigurationException, NotSupportedException + from annif.suggestion import SubjectSuggestion, SuggestionResults + + +def project_not_found_error(project_id: str) -> ConnexionResponse: """return a Connexion error object when a project is not found""" return connexion.problem( @@ -21,7 +33,9 @@ def project_not_found_error(project_id): ) -def server_error(err): +def server_error( + err: Union[ConfigurationException, NotSupportedException] +) -> ConnexionResponse: """return a Connexion error object when there is a server error (project or backend problem)""" @@ -30,13 +44,13 @@ def server_error(err): ) -def show_info(): +def show_info() -> Dict[str, str]: """return version of annif and a title for the api according to OpenAPI spec""" return {"title": "Annif REST API", "version": importlib.metadata.version("annif")} -def language_not_supported_error(lang): +def language_not_supported_error(lang: str) -> ConnexionResponse: """return a Connexion error object when attempting to use unsupported language""" return connexion.problem( @@ -46,7 +60,9 @@ def language_not_supported_error(lang): ) -def list_projects(): +def list_projects() -> ( + Dict[str, List[Dict[str, Optional[Union[str, Dict[str, str], bool, datetime]]]]] +): """return a dict with projects formatted according to OpenAPI spec""" return { @@ -57,7 +73,9 @@ def list_projects(): } -def show_project(project_id): +def show_project( + project_id: str, +) -> Union[Dict[str, Optional[Union[str, Dict[str, str], bool]]], ConnexionResponse]: """return a single project formatted according to OpenAPI spec""" try: @@ -67,7 +85,9 @@ def show_project(project_id): return project.dump() -def _suggestion_to_dict(suggestion, subject_index, language): +def _suggestion_to_dict( + suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str +) -> Dict[str, Optional[Union[str, float]]]: subject = subject_index[suggestion.subject_id] return { "uri": subject.uri, @@ -77,21 +97,44 @@ def _suggestion_to_dict(suggestion, subject_index, language): } -def _hit_sets_to_list(hit_sets, subjects, lang): +def _hit_sets_to_list( + hit_sets: SuggestionResults, subjects: SubjectIndex, lang: str +) -> List[ + Union[ + Dict[str, List[Any]], + Any, + Dict[str, List[Dict[str, Union[str, float]]]], + Dict[str, List[Dict[str, Optional[Union[str, float]]]]], + ] +]: return [ {"results": [_suggestion_to_dict(hit, subjects, lang) for hit in hits]} for hits in hit_sets ] -def _is_error(result): +def _is_error( + result: Union[ + List[Dict[str, List[Any]]], + List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], + List[Dict[str, List[Dict[str, Union[str, float]]]]], + ConnexionResponse, + ] +) -> bool: return ( isinstance(result, connexion.lifecycle.ConnexionResponse) and result.status_code >= 400 ) -def suggest(project_id, body): +def suggest( + project_id: str, body: Dict[str, Union[int, float, str]] +) -> Union[ + Dict[str, List[Any]], + Dict[str, List[Dict[str, 
Optional[Union[str, float]]]]], + ConnexionResponse, + Dict[str, List[Dict[str, Union[str, float]]]], +]: """suggest subjects for the given text and return a dict with results formatted according to OpenAPI spec""" @@ -106,7 +149,16 @@ def suggest(project_id, body): return result[0] -def suggest_batch(project_id, body, **query_parameters): +def suggest_batch( + project_id: str, + body: Dict[str, Union[List[Any], List[Dict[str, str]]]], + **query_parameters, +) -> Union[ + List[Dict[str, None]], + List[Dict[str, Optional[List[Dict[str, Optional[Union[str, float]]]]]]], + List[Dict[str, Union[List[Dict[str, Optional[Union[str, float]]]], str]]], + ConnexionResponse, +]: """suggest subjects for the given documents and return a list of dicts with results formatted according to OpenAPI spec""" @@ -120,7 +172,16 @@ def suggest_batch(project_id, body, **query_parameters): return result -def _suggest(project_id, documents, parameters): +def _suggest( + project_id: str, + documents: List[Union[Dict[str, str], Any]], + parameters: Dict[str, Union[int, float, str]], +) -> Union[ + List[Dict[str, List[Any]]], + List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], + List[Dict[str, List[Dict[str, Union[str, float]]]]], + ConnexionResponse, +]: corpus = _documents_to_corpus(documents, subject_index=None) try: project = annif.registry.get_project(project_id, min_access=Access.hidden) @@ -146,7 +207,12 @@ def _suggest(project_id, documents, parameters): return _hit_sets_to_list(hit_sets, project.subjects, lang) -def _documents_to_corpus(documents, subject_index): +def _documents_to_corpus( + documents: List[ + Union[Dict[str, str], Dict[str, Union[List[Dict[str, str]], str]], Any] + ], + subject_index: Optional[SubjectIndex], +) -> annif.corpus.document.DocumentList: if subject_index is not None: corpus = [ Document( @@ -165,7 +231,16 @@ def _documents_to_corpus(documents, subject_index): return DocumentList(corpus) -def learn(project_id, body): +def learn( + project_id: str, + body: List[ + Union[ + Dict[str, Union[List[Dict[str, str]], str]], + Any, + Dict[str, Optional[List[bool]]], + ] + ], +) -> Union[ConnexionResponse, Tuple[None, int]]: """learn from documents and return an empty 204 response if succesful""" try: diff --git a/annif/suggestion.py b/annif/suggestion.py index 9e967d4bf..fa5af7e80 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -1,15 +1,23 @@ """Representing suggested subjects.""" +from __future__ import annotations import collections -import itertools +from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union -import numpy as np from scipy.sparse import csr_array +if TYPE_CHECKING: + from itertools import chain + + from numpy import ndarray + from scipy.sparse._arrays import csr_array + + from annif.corpus.subject import SubjectIndex + SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score") -def vector_to_suggestions(vector, limit): +def vector_to_suggestions(vector: ndarray, limit: int) -> Iterator[Any]: limit = min(len(vector), limit) topk_idx = np.argpartition(vector, -limit)[-limit:] return ( @@ -17,7 +25,11 @@ def vector_to_suggestions(vector, limit): ) -def filter_suggestion(preds, limit=None, threshold=0.0): +def filter_suggestion( + preds: scipy.sparse._arrays.csr_array, + limit: Optional[int] = None, + threshold: Union[int, float] = 0.0, +) -> scipy.sparse._arrays.csr_array: """filter a 2D sparse suggestion array (csr_array), retaining only the top K suggestions with a score above or equal to the threshold for 
each individual prediction; the rest will be left as zeros""" @@ -43,7 +55,7 @@ def filter_suggestion(preds, limit=None, threshold=0.0): class SuggestionResult: """Suggestions for a single document, backed by a row of a sparse array.""" - def __init__(self, array, idx): + def __init__(self, array: scipy.sparse._arrays.csr_array, idx: int) -> None: self._array = array self._idx = idx @@ -57,10 +69,10 @@ def __iter__(self): sorted(suggestions, key=lambda suggestion: suggestion.score, reverse=True) ) - def as_vector(self): + def as_vector(self) -> ndarray: return self._array[[self._idx], :].toarray()[0] - def __len__(self): + def __len__(self) -> int: _, cols = self._array[[self._idx], :].nonzero() return len(cols) @@ -68,13 +80,18 @@ def __len__(self): class SuggestionBatch: """Subject suggestions for a batch of documents.""" - def __init__(self, array): + def __init__(self, array: scipy.sparse._arrays.csr_array) -> None: """Create a new SuggestionBatch from a csr_array""" assert isinstance(array, csr_array) self.array = array @classmethod - def from_sequence(cls, suggestion_results, subject_index, limit=None): + def from_sequence( + cls, + suggestion_results: List[List[SubjectSuggestion]], + subject_index: SubjectIndex, + limit: Optional[int] = None, + ) -> "SuggestionBatch": """Create a new SuggestionBatch from a sequence where each item is a sequence of SubjectSuggestion objects.""" @@ -96,7 +113,9 @@ def from_sequence(cls, suggestion_results, subject_index, limit=None): ) @classmethod - def from_averaged(cls, batches, weights): + def from_averaged( + cls, batches: List[SuggestionBatch], weights: List[Union[int, float]] + ) -> "SuggestionBatch": """Create a new SuggestionBatch where the subject scores are the weighted average of scores in several SuggestionBatches""" @@ -105,31 +124,35 @@ def from_averaged(cls, batches, weights): ) / sum(weights) return SuggestionBatch(avg_array) - def filter(self, limit=None, threshold=0.0): + def filter( + self, limit: Optional[int] = None, threshold: float = 0.0 + ) -> "SuggestionBatch": """Return a subset of the hits, filtered by the given limit and score threshold, as another SuggestionBatch object.""" return SuggestionBatch(filter_suggestion(self.array, limit, threshold)) - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> SuggestionResult: if idx < 0 or idx >= len(self): raise IndexError return SuggestionResult(self.array, idx) - def __len__(self): + def __len__(self) -> int: return self.array.shape[0] class SuggestionResults: """Subject suggestions for a potentially very large number of documents.""" - def __init__(self, batches): + def __init__(self, batches: List[SuggestionBatch]) -> None: """Initialize a new SuggestionResults from an iterable that provides SuggestionBatch objects.""" self.batches = batches - def filter(self, limit=None, threshold=0.0): + def filter( + self, limit: Optional[int] = None, threshold: float = 0.0 + ) -> "SuggestionResults": """Return a view of these suggestions, filtered by the given limit and/or threshold, as another SuggestionResults object.""" @@ -137,5 +160,5 @@ def filter(self, limit=None, threshold=0.0): (batch.filter(limit, threshold) for batch in self.batches) ) - def __iter__(self): + def __iter__(self) -> itertools.chain: return iter(itertools.chain.from_iterable(self.batches)) diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index 59317f3f6..bda282bed 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -1,6 +1,8 @@ """Functionality for 
obtaining text transformation from string specification""" +from __future__ import annotations import re +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import annif from annif.exception import ConfigurationException @@ -8,8 +10,23 @@ from . import inputlimiter, transform +if TYPE_CHECKING: + from unittest.mock import Mock -def parse_specs(transform_specs): + from annif.project import AnnifProject + from annif.transform.transform import TransformChain + + +def parse_specs( + transform_specs: str, +) -> List[ + Union[ + Tuple[str, List[Any], Dict[Any, Any]], + Tuple[str, List[str], Dict[str, str]], + Tuple[str, List[str], Dict[Any, Any]], + Tuple[str, List[Any], Dict[str, str]], + ] +]: """Parse a transformation specification into a list of tuples, e.g. 'transf_1(x),transf_2(y=42),transf_3' is parsed to [(transf_1, [x], {}), (transf_2, [], {y: 42}), (transf_3, [], {})].""" @@ -27,7 +44,9 @@ def parse_specs(transform_specs): return parsed -def get_transform(transform_specs, project): +def get_transform( + transform_specs: str, project: Optional[Union[AnnifProject, Mock]] +) -> TransformChain: transform_defs = parse_specs(transform_specs) transform_classes = [] args = [] diff --git a/annif/transform/inputlimiter.py b/annif/transform/inputlimiter.py index 6883c4c9b..6b8c3e9e1 100644 --- a/annif/transform/inputlimiter.py +++ b/annif/transform/inputlimiter.py @@ -1,23 +1,33 @@ """A simple transformation that truncates the text of input documents to a given character length.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union from annif.exception import ConfigurationException from . import transform +if TYPE_CHECKING: + from unittest.mock import Mock + + from annif.project import AnnifProject + class InputLimiter(transform.BaseTransform): name = "limit" - def __init__(self, project, input_limit): + def __init__( + self, project: Optional[Union[Mock, AnnifProject]], input_limit: str + ) -> None: super().__init__(project) self.input_limit = int(input_limit) self._validate_value(self.input_limit) - def transform_fn(self, text): + def transform_fn(self, text: str) -> str: return text[: self.input_limit] - def _validate_value(self, input_limit): + def _validate_value(self, input_limit: int) -> None: if input_limit < 0: raise ConfigurationException( "input_limit in limit_input transform cannot be negative", diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index 8ee6285a7..3e63ee6d0 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -1,5 +1,8 @@ """Transformation filtering out parts of a text that are in a language different from the language of the project.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Union from simplemma.langdetect import in_target_language @@ -7,6 +10,9 @@ from . 
import transform +if TYPE_CHECKING: + from unittest.mock import Mock + logger = annif.logger @@ -14,14 +20,18 @@ class LangFilter(transform.BaseTransform): name = "filter_lang" def __init__( - self, project, text_min_length=500, sentence_min_length=50, min_ratio=0.5 - ): + self, + project: Mock, + text_min_length: Union[int, str] = 500, + sentence_min_length: Union[int, str] = 50, + min_ratio: float = 0.5, + ) -> None: super().__init__(project) self.text_min_length = int(text_min_length) self.sentence_min_length = int(sentence_min_length) self.min_ratio = float(min_ratio) - def transform_fn(self, text): + def transform_fn(self, text: str) -> str: if len(text) < self.text_min_length: return text diff --git a/annif/transform/transform.py b/annif/transform/transform.py index 42123ab56..a5d4b623c 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -1,10 +1,27 @@ """Common functionality for transforming text of input documents.""" +from __future__ import annotations import abc +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union from annif.corpus import TransformingDocumentCorpus from annif.exception import ConfigurationException +if TYPE_CHECKING: + from unittest.mock import Mock + + from annif.corpus.combine import CombinedCorpus + from annif.corpus.document import ( + DocumentDirectory, + DocumentFile, + DocumentList, + LimitingDocumentCorpus, + TransformingDocumentCorpus, + ) + from annif.project import AnnifProject + from annif.transform.inputlimiter import InputLimiter + from annif.transform.langfilter import LangFilter + class BaseTransform(metaclass=abc.ABCMeta): """Base class for text transformations, which need to implement the @@ -12,7 +29,7 @@ class BaseTransform(metaclass=abc.ABCMeta): name = None - def __init__(self, project): + def __init__(self, project: Optional[Union[AnnifProject, Mock]]) -> None: self.project = project @abc.abstractmethod @@ -26,7 +43,7 @@ class IdentityTransform(BaseTransform): name = "pass" - def transform_fn(self, text): + def transform_fn(self, text: str) -> str: return text @@ -34,11 +51,36 @@ class TransformChain: """Class instantiating and holding the transformation objects performing the actual text transformation.""" - def __init__(self, transform_classes, args, project): + def __init__( + self, + transform_classes: List[ + Union[Type[InputLimiter], Type[IdentityTransform], Type[LangFilter]] + ], + args: List[ + Union[ + Tuple[List[Any], Dict[str, str]], + Tuple[List[str], Dict[Any, Any]], + Tuple[List[Any], Dict[Any, Any]], + ] + ], + project: Optional[Union[AnnifProject, Mock]], + ) -> None: self.project = project self.transforms = self._init_transforms(transform_classes, args) - def _init_transforms(self, transform_classes, args): + def _init_transforms( + self, + transform_classes: List[ + Union[Type[InputLimiter], Type[IdentityTransform], Type[LangFilter]] + ], + args: List[ + Union[ + Tuple[List[Any], Dict[str, str]], + Tuple[List[str], Dict[Any, Any]], + Tuple[List[Any], Dict[Any, Any]], + ] + ], + ) -> List[Union[InputLimiter, IdentityTransform, LangFilter]]: transforms = [] for trans, (posargs, kwargs) in zip(transform_classes, args): try: @@ -51,10 +93,19 @@ def _init_transforms(self, transform_classes, args): ) return transforms - def transform_text(self, text): + def transform_text(self, text: str) -> str: for trans in self.transforms: text = trans.transform_fn(text) return text - def transform_corpus(self, corpus): + def transform_corpus( + self, + corpus: Union[ + 
annif.corpus.document.DocumentDirectory, + annif.corpus.document.LimitingDocumentCorpus, + CombinedCorpus, + annif.corpus.document.DocumentList, + annif.corpus.document.DocumentFile, + ], + ) -> annif.corpus.document.TransformingDocumentCorpus: return TransformingDocumentCorpus(corpus, self.transform_text) diff --git a/annif/vocab.py b/annif/vocab.py index 14f6209ba..d63f0ca7d 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -1,6 +1,8 @@ """Vocabulary management functionality for Annif""" +from __future__ import annotations import os.path +from typing import TYPE_CHECKING, List, Union import annif import annif.corpus @@ -8,6 +10,12 @@ from annif.datadir import DatadirMixin from annif.exception import NotInitializedException +if TYPE_CHECKING: + from rdflib.graph import Graph + + from annif.corpus.skos import SubjectFileSKOS + from annif.corpus.subject import SubjectFileCSV, SubjectFileTSV, SubjectIndex + logger = annif.logger @@ -23,18 +31,20 @@ class AnnifVocabulary(DatadirMixin): INDEX_FILENAME_TTL = "subjects.ttl" INDEX_FILENAME_CSV = "subjects.csv" - def __init__(self, vocab_id, datadir): + def __init__(self, vocab_id: str, datadir: str) -> None: DatadirMixin.__init__(self, datadir, "vocabs", vocab_id) self.vocab_id = vocab_id self._skos_vocab = None - def _create_subject_index(self, subject_corpus): + def _create_subject_index( + self, subject_corpus: Union[SubjectFileCSV, SubjectFileTSV, SubjectFileSKOS] + ) -> SubjectIndex: subjects = annif.corpus.SubjectIndex() subjects.load_subjects(subject_corpus) annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV) return subjects - def _update_subject_index(self, subject_corpus): + def _update_subject_index(self, subject_corpus: SubjectFileTSV) -> SubjectIndex: old_subjects = self.subjects new_subjects = annif.corpus.SubjectIndex() new_subjects.load_subjects(subject_corpus) @@ -55,7 +65,7 @@ def _update_subject_index(self, subject_corpus): return updated_subjects @property - def subjects(self): + def subjects(self) -> SubjectIndex: if self._subjects is None: path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV) if os.path.exists(path): @@ -66,7 +76,7 @@ def subjects(self): return self._subjects @property - def skos(self): + def skos(self) -> SubjectFileSKOS: """return the subject vocabulary from SKOS file""" if self._skos_vocab is not None: return self._skos_vocab @@ -94,14 +104,18 @@ def skos(self): raise NotInitializedException(f"graph file {path} not found") - def __len__(self): + def __len__(self) -> int: return len(self.subjects) @property - def languages(self): + def languages(self) -> List[str]: return self.subjects.languages - def load_vocabulary(self, subject_corpus, force=False): + def load_vocabulary( + self, + subject_corpus: Union[SubjectFileCSV, SubjectFileTSV, SubjectFileSKOS], + force: bool = False, + ) -> None: """Load subjects from a subject corpus and save them into one or more subject index files as well as a SKOS/Turtle file for later use. 
If force=True, replace the existing subject index completely.""" @@ -119,6 +133,6 @@ def load_vocabulary(self, subject_corpus, force=False): logger.info(f"saving vocabulary into SKOS file {skosfile}") subject_corpus.save_skos(skosfile) - def as_graph(self): + def as_graph(self) -> Graph: """return the vocabulary as an rdflib graph""" return self.skos.graph From c8a04cbd709687701653612d9df0f470ddf2e384 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 May 2023 12:49:00 +0300 Subject: [PATCH 02/28] Use dict instead of OrderedDict --- annif/registry.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/annif/registry.py b/annif/registry.py index 6d8db92dc..7e631221b 100644 --- a/annif/registry.py +++ b/annif/registry.py @@ -14,8 +14,6 @@ from annif.vocab import AnnifVocabulary if TYPE_CHECKING: - from collections import OrderedDict - from werkzeug.local import LocalProxy logger = annif.logger @@ -51,7 +49,7 @@ def _init_vars(self) -> None: self._projects[self._rid] = self._create_projects() self._vocabs[self._rid] = {} - def _create_projects(self) -> collections.OrderedDict: + def _create_projects(self) -> Dict: # parse the configuration config = parse_config(self._projects_config_path) @@ -60,7 +58,7 @@ def _create_projects(self) -> collections.OrderedDict: return {} # create AnnifProject objects from the configuration file - projects = collections.OrderedDict() + projects = dict() for project_id in config.project_ids: projects[project_id] = AnnifProject( project_id, config[project_id], self._datadir, self From 7ed97a562682c7d9dfc4fd6c8cc1f442957d63f5 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 May 2023 13:06:59 +0300 Subject: [PATCH 03/28] Restore regular imports that were too eagerly made conditional --- annif/backend/mllm.py | 8 ++++---- annif/backend/nn_ensemble.py | 6 +++--- annif/backend/svc.py | 4 ++-- annif/config.py | 6 ++---- annif/corpus/combine.py | 3 +-- annif/corpus/subject.py | 10 +++++----- annif/eval.py | 8 ++++---- annif/lexical/mllm.py | 22 ++++++++++++---------- annif/suggestion.py | 9 ++++----- 9 files changed, 37 insertions(+), 39 deletions(-) diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index da6d1799b..6f460dee1 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Tuple, Union import joblib +import numpy as np import annif.eval import annif.util @@ -15,7 +16,6 @@ from .
import backend, hyperopt if TYPE_CHECKING: - from numpy import float64, ndarray from optuna.study.study import Study from optuna.trial._trial import Trial @@ -39,7 +39,7 @@ def _prepare(self, n_jobs: int = 1) -> None: self._candidates.append(candidates) self._gold_subjects.append(doc.subject_set) - def _objective(self, trial: Trial) -> float: + def _objective(self, trial: Trial) -> np.float64: params = { "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 30), "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 100, 2000), @@ -108,7 +108,7 @@ def _load_model(self) -> MLLMModel: "model {} not found".format(path), backend_id=self.backend_id ) - def _load_train_data(self) -> Tuple[ndarray, ndarray]: + def _load_train_data(self) -> Tuple[np.ndarray, np.ndarray]: path = os.path.join(self.datadir, self.TRAIN_FILE) if os.path.exists(path): return joblib.load(path) @@ -157,7 +157,7 @@ def _generate_candidates(self, text: str) -> List[Union[Candidate, Any]]: def _prediction_to_result( self, - prediction: List[Union[Tuple[float64, int], Any]], + prediction: List[Union[Tuple[np.float64, int], Any]], params: Dict[str, Union[int, float, bool, str]], ) -> Iterator[Any]: vector = np.zeros(len(self.project.subjects), dtype=np.float32) diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index 1ea8989fb..5ef2a857f 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -9,6 +9,7 @@ import joblib import lmdb +import numpy as np import tensorflow.keras.backend as K from scipy.sparse import csc_matrix, csr_matrix from tensorflow.keras.layers import Add, Dense, Dropout, Flatten, Input, Layer @@ -24,7 +25,6 @@ from . import backend, ensemble if TYPE_CHECKING: - from numpy import ndarray from tensorflow.python.framework.ops import EagerTensor from annif.corpus.document import DocumentFile, LimitingDocumentCorpus @@ -53,7 +53,7 @@ def __init__(self, txn, batch_size): self._counter = 0 self._batch_size = batch_size - def add_sample(self, inputs: ndarray, targets: ndarray) -> None: + def add_sample(self, inputs: np.ndarray, targets: np.ndarray) -> None: # use zero-padded 8-digit key key = idx_to_key(self._counter) self._counter += 1 @@ -64,7 +64,7 @@ def add_sample(self, inputs: ndarray, targets: ndarray) -> None: buf.seek(0) self._txn.put(key, buf.read()) - def __getitem__(self, idx: int) -> Tuple[ndarray, ndarray]: + def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: """get a particular batch of samples""" cursor = self._txn.cursor() first_key = idx * self._batch_size diff --git a/annif/backend/svc.py b/annif/backend/svc.py index fbeab02e7..cd5014f35 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Dict, List, Tuple, Union import joblib +import numpy as np import scipy.special from sklearn.svm import LinearSVC @@ -15,7 +16,6 @@ from .
import backend, mixins if TYPE_CHECKING: - from numpy import ndarray from scipy.sparse._csr import csr_matrix from annif.corpus.document import DocumentFile @@ -97,7 +97,7 @@ def _train( self._train_classifier(veccorpus, classes) def _scores_to_suggestions( - self, scores: ndarray, params: Dict[str, int] + self, scores: np.ndarray, params: Dict[str, int] ) -> List[SubjectSuggestion]: results = [] limit = int(params["limit"]) diff --git a/annif/config.py b/annif/config.py index 5ff51b25e..cad59e734 100644 --- a/annif/config.py +++ b/annif/config.py @@ -1,9 +1,10 @@ """Configuration file handling""" from __future__ import annotations +import configparser import os.path from glob import glob -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union import tomli @@ -11,9 +12,6 @@ import annif.util from annif.exception import ConfigurationException -if TYPE_CHECKING: - from configparser import SectionProxy - logger = annif.logger diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index 90bbf74d6..067c316e3 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -1,13 +1,12 @@ """Class for combining multiple corpora so they behave like a single corpus""" from __future__ import annotations +import itertools from typing import TYPE_CHECKING, List from .types import DocumentCorpus if TYPE_CHECKING: - from itertools import chain - from annif.corpus.document import DocumentFile diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index cd8c08bdb..789925e23 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -12,8 +12,6 @@ from .types import Subject, SubjectCorpus if TYPE_CHECKING: - from numpy import int32, ndarray - from annif.corpus.skos import SubjectFileSKOS from annif.corpus.types import Subject @@ -138,7 +136,7 @@ def __len__(self) -> int: def languages(self) -> List[str]: return self._languages - def __getitem__(self, subject_id: Union[int, int32]) -> Subject: + def __getitem__(self, subject_id: Union[int, np.int32]) -> Subject: return self._subjects[subject_id] def append(self, subject: Subject) -> None: @@ -280,13 +278,15 @@ def _parse_line( return uri, label def as_vector( - self, size: Optional[int] = None, destination: Optional[ndarray] = None - ) -> ndarray: + self, size: Optional[int] = None, destination: Optional[np.ndarray] = None + ) -> np.ndarray: """Return the hits as a one-dimensional NumPy array in sklearn multilabel indicator format. Use destination array if given (not None), otherwise create and return a new one of the given size.""" if destination is None: + import numpy as np + assert size is not None and size > 0 destination = np.zeros(size, dtype=bool) diff --git a/annif/eval.py b/annif/eval.py index aa7ba8870..64d61a3b8 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -4,6 +4,7 @@ import warnings from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union +import numpy as np import scipy.sparse from sklearn.metrics import f1_score, precision_score, recall_score @@ -14,7 +15,6 @@ from io import TextIOWrapper from click.utils import LazyFile - from numpy import float64 from scipy.sparse._arrays import csr_array from annif.corpus.subject import SubjectIndex, SubjectSet @@ -41,7 +41,7 @@ def false_negatives(y_true: csr_array, y_pred: csr_array) -> int: def dcg_score( y_true: csr_array, y_pred: csr_array, limit: Optional[int] = None -) -> float64: +) -> np.float64: """return the discounted cumulative gain (DCG) score for the selected labels vs. 
relevant labels""" @@ -151,7 +151,7 @@ def _evaluate_samples( y_true: csr_array, y_pred: csr_array, metrics: Union[Tuple[str, str], Tuple[()], List[str]] = [], - ) -> Dict[str, Union[float64, float, int]]: + ) -> Dict[str, Union[np.float64, float, int]]: y_pred_binary = y_pred > 0.0 # define the available metrics as lazy lambda functions @@ -288,7 +288,7 @@ def results( metrics: Union[Tuple[str, str], Tuple[()], List[str]] = [], results_file: Optional[Union[LazyFile, TextIOWrapper]] = None, language: Optional[str] = None, - ) -> Dict[str, Union[float64, float, int]]: + ) -> Dict[str, Union[np.float64, float, int]]: """evaluate a set of selected subjects against a gold standard using different metrics. If metrics is empty, use all available metrics. If results_file (file object) given, write results per subject to it diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 40af32774..8e8233915 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Tuple, Union import joblib +import numpy as np from rdflib.namespace import SKOS from sklearn.ensemble import BaggingClassifier from sklearn.feature_extraction.text import CountVectorizer @@ -24,7 +25,6 @@ ) if TYPE_CHECKING: - from numpy import float64, ndarray from rdflib.graph import Graph from rdflib.term import URIRef from sklearn.ensemble._bagging import BaggingClassifier @@ -104,7 +104,9 @@ def generate_candidates( return conflate_matches(matches, len(sentences)) -def candidates_to_features(candidates: List[Candidate], mdata: "ModelData") -> ndarray: +def candidates_to_features( + candidates: List[Candidate], mdata: "ModelData" +) -> np.ndarray: """Convert a list of Candidates to a NumPy feature matrix""" matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32) @@ -168,7 +170,7 @@ def _model_data(self) -> "ModelData": idf=self._idf, ) - def _candidates_to_features(self, candidates: List[Candidate]) -> ndarray: + def _candidates_to_features(self, candidates: List[Candidate]) -> np.ndarray: return candidates_to_features(candidates, self._model_data) @staticmethod @@ -286,7 +288,7 @@ def _calculate_idf( def _prepare_features( self, train_x: List[List[Union[Candidate, Any]]], n_jobs: int - ) -> List[ndarray]: + ) -> List[np.ndarray]: fc_args = {"mdata": self._model_data} jobs, pool_class = annif.parallel.get_pool(n_jobs) @@ -306,7 +308,7 @@ def prepare_train( analyzer: SnowballAnalyzer, params: Dict[str, Union[int, float, bool, str]], n_jobs: int, - ) -> Tuple[ndarray, ndarray]: + ) -> Tuple[np.ndarray, np.ndarray]: # create an index from the vocabulary terms subject_ids = self._prepare_train_index(vocab, analyzer, params) @@ -334,8 +336,8 @@ def _create_classifier( def train( self, - train_x: Union[ndarray, List[Tuple[int, int]]], - train_y: Union[List[bool], ndarray], + train_x: Union[np.ndarray, List[Tuple[int, int]]], + train_y: Union[List[bool], np.ndarray], params: Dict[str, Union[int, float, bool, str]], ) -> None: # fit the model on the training corpus @@ -351,14 +353,14 @@ def train( ) def _prediction_to_list( - self, scores: ndarray, candidates: List[Candidate] - ) -> List[Tuple[float64, int]]: + self, scores: np.ndarray, candidates: List[Candidate] + ) -> List[Tuple[np.float64, int]]: subj_scores = [(score[1], c.subject_id) for score, c in zip(scores, candidates)] return sorted(subj_scores, reverse=True) def predict( self, candidates: List[Union[Candidate, Any]] - ) -> List[Union[Any, Tuple[float64, int]]]: + ) -> 
List[Union[Any, Tuple[np.float64, int]]]: if not candidates: return [] features = self._candidates_to_features(candidates) diff --git a/annif/suggestion.py b/annif/suggestion.py index fa5af7e80..720232f6e 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -2,14 +2,13 @@ from __future__ import annotations import collections +import itertools from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union +import numpy as np from scipy.sparse import csr_array if TYPE_CHECKING: - from itertools import chain - - from numpy import ndarray from scipy.sparse._arrays import csr_array from annif.corpus.subject import SubjectIndex @@ -17,7 +16,7 @@ SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score") -def vector_to_suggestions(vector: ndarray, limit: int) -> Iterator[Any]: +def vector_to_suggestions(vector: np.ndarray, limit: int) -> Iterator[Any]: limit = min(len(vector), limit) topk_idx = np.argpartition(vector, -limit)[-limit:] return ( @@ -69,7 +68,7 @@ def __iter__(self): sorted(suggestions, key=lambda suggestion: suggestion.score, reverse=True) ) - def as_vector(self) -> ndarray: + def as_vector(self) -> np.ndarray: return self._array[[self._idx], :].toarray()[0] def __len__(self) -> int: From 1a36c0950b6640251a4043a5daf782f745f85d09 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 May 2023 14:41:12 +0300 Subject: [PATCH 04/28] Fix flake8 errors --- annif/__init__.py | 6 +++--- annif/analyzer/analyzer.py | 2 +- annif/analyzer/simple.py | 2 -- annif/analyzer/simplemma.py | 2 -- annif/analyzer/snowball.py | 1 - annif/analyzer/spacy.py | 2 +- annif/analyzer/voikko.py | 2 +- annif/backend/pav.py | 4 +--- annif/corpus/skos.py | 2 -- annif/corpus/subject.py | 3 ++- annif/corpus/types.py | 2 +- annif/datadir.py | 1 - annif/lexical/mllm.py | 3 +-- annif/lexical/util.py | 12 ++++-------- annif/openapi/validation.py | 2 +- annif/project.py | 1 - annif/rest.py | 1 - annif/suggestion.py | 10 ++++------ annif/transform/transform.py | 20 ++------------------ 19 files changed, 22 insertions(+), 56 deletions(-) diff --git a/annif/__init__.py b/annif/__init__.py index 221835da5..6258afb0a 100644 --- a/annif/__init__.py +++ b/annif/__init__.py @@ -5,12 +5,12 @@ import logging import os import os.path +from typing import TYPE_CHECKING, Optional logging.basicConfig() logger = logging.getLogger("annif") logger.setLevel(level=logging.INFO) -from typing import TYPE_CHECKING, Optional import annif.backend # noqa @@ -18,7 +18,7 @@ from flask.app import Flask -def create_flask_app(config_name: None = None) -> flask.app.Flask: +def create_flask_app(config_name: None = None) -> Flask: """Create a Flask app to be used by the CLI.""" from flask import Flask @@ -30,7 +30,7 @@ def create_flask_app(config_name: None = None) -> flask.app.Flask: return app -def create_app(config_name: Optional[str] = None) -> flask.app.Flask: +def create_app(config_name: Optional[str] = None) -> Flask: """Create a Connexion app to be used for the API.""" # 'cxapp' here is the Connexion application that has a normal Flask app # as a property (cxapp.app) diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 5eb45853d..00657a094 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -4,7 +4,7 @@ import abc import functools import unicodedata -from typing import TYPE_CHECKING, Any, List, Union +from typing import Any, List, Union _KEY_TOKEN_MIN_LENGTH = "token_min_length" diff --git 
a/annif/analyzer/simple.py b/annif/analyzer/simple.py index 15e386d0a..4cc35e6f1 100644 --- a/annif/analyzer/simple.py +++ b/annif/analyzer/simple.py @@ -1,8 +1,6 @@ """Simple analyzer for Annif. Only folds words to lower case.""" from __future__ import annotations -from typing import TYPE_CHECKING - from . import analyzer diff --git a/annif/analyzer/simplemma.py b/annif/analyzer/simplemma.py index 9ee0fcbb8..e535b25de 100644 --- a/annif/analyzer/simplemma.py +++ b/annif/analyzer/simplemma.py @@ -1,8 +1,6 @@ """Simplemma analyzer for Annif, based on simplemma lemmatizer.""" from __future__ import annotations -from typing import TYPE_CHECKING - import simplemma from . import analyzer diff --git a/annif/analyzer/snowball.py b/annif/analyzer/snowball.py index 7f0b370d8..57990c2a1 100644 --- a/annif/analyzer/snowball.py +++ b/annif/analyzer/snowball.py @@ -2,7 +2,6 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING from . import analyzer diff --git a/annif/analyzer/spacy.py b/annif/analyzer/spacy.py index d3a4c649e..6579e861b 100644 --- a/annif/analyzer/spacy.py +++ b/annif/analyzer/spacy.py @@ -1,7 +1,7 @@ """spaCy analyzer for Annif which uses spaCy for lemmatization""" from __future__ import annotations -from typing import TYPE_CHECKING, List +from typing import List import annif.util from annif.exception import OperationFailedException diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py index 24db55918..1006ce358 100644 --- a/annif/analyzer/voikko.py +++ b/annif/analyzer/voikko.py @@ -2,7 +2,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Dict, Optional +from typing import Dict, Optional import voikko.libvoikko diff --git a/annif/backend/pav.py b/annif/backend/pav.py index 125be6aed..dc35eb5fb 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -20,8 +20,6 @@ from . 
import backend, ensemble if TYPE_CHECKING: - from scipy.sparse._csc import csc_matrix - from annif.corpus.document import DocumentFile from annif.project import AnnifProject @@ -97,7 +95,7 @@ def _merge_source_batches( @staticmethod def _suggest_train_corpus( source_project: AnnifProject, corpus: DocumentFile - ) -> Tuple[scipy.sparse._csc.csc_matrix, scipy.sparse._csc.csc_matrix]: + ) -> Tuple[csc_matrix, csc_matrix]: # lists for constructing score matrix data, row, col = [], [], [] # lists for constructing true label matrix diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 2d9ad6fc0..17d84e692 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -27,8 +27,6 @@ if TYPE_CHECKING: from rdflib.term import URIRef - from annif.corpus.types import Subject - def serialize_subjects_to_skos(subjects: Iterator[Any], path: str) -> None: """Create a SKOS representation of the given subjects and serialize it diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 789925e23..607e6c503 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -12,8 +12,9 @@ from .types import Subject, SubjectCorpus if TYPE_CHECKING: + import numpy as np + from annif.corpus.skos import SubjectFileSKOS - from annif.corpus.types import Subject logger = annif.logger.getChild("subject") logger.addFilter(annif.util.DuplicateFilter()) diff --git a/annif/corpus/types.py b/annif/corpus/types.py index 3a4a7e02a..3a7531174 100644 --- a/annif/corpus/types.py +++ b/annif/corpus/types.py @@ -4,7 +4,7 @@ import abc import collections from itertools import islice -from typing import TYPE_CHECKING, Iterator, List +from typing import Iterator, List Document = collections.namedtuple("Document", "text subject_set") diff --git a/annif/datadir.py b/annif/datadir.py index 84ea61fe9..752da32dd 100644 --- a/annif/datadir.py +++ b/annif/datadir.py @@ -3,7 +3,6 @@ import os import os.path -from typing import TYPE_CHECKING class DatadirMixin: diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 8e8233915..5cf80cd4c 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -27,7 +27,6 @@ if TYPE_CHECKING: from rdflib.graph import Graph from rdflib.term import URIRef - from sklearn.ensemble._bagging import BaggingClassifier from annif.analyzer.snowball import SnowballAnalyzer from annif.corpus.document import DocumentDirectory @@ -325,7 +324,7 @@ def prepare_train( def _create_classifier( self, params: Dict[str, Union[int, float, bool, str]] - ) -> sklearn.ensemble._bagging.BaggingClassifier: + ) -> BaggingClassifier: return BaggingClassifier( DecisionTreeClassifier( min_samples_leaf=int(params["min_samples_leaf"]), diff --git a/annif/lexical/util.py b/annif/lexical/util.py index abd0c91e4..a2c6110e6 100644 --- a/annif/lexical/util.py +++ b/annif/lexical/util.py @@ -10,14 +10,12 @@ if TYPE_CHECKING: from rdflib.graph import Graph - from rdflib.term import URIRef - from scipy.sparse._csc import csc_matrix from annif.vocab import AnnifVocabulary def get_subject_labels( - graph: Graph, uri: str, properties: List[rdflib.term.URIRef], language: str + graph: Graph, uri: str, properties: List[URIRef], language: str ) -> List[Union[Any, str]]: return [ str(label) @@ -28,8 +26,8 @@ def get_subject_labels( def make_relation_matrix( - graph: Graph, vocab: AnnifVocabulary, property: rdflib.term.URIRef -) -> scipy.sparse._csc.csc_matrix: + graph: Graph, vocab: AnnifVocabulary, property: URIRef +) -> csc_matrix: n_subj = len(vocab.subjects) matrix = lil_matrix((n_subj, n_subj), dtype=bool) @@ 
-42,9 +40,7 @@ def make_relation_matrix( return csc_matrix(matrix) -def make_collection_matrix( - graph: Graph, vocab: AnnifVocabulary -) -> scipy.sparse._csc.csc_matrix: +def make_collection_matrix(graph: Graph, vocab: AnnifVocabulary) -> csc_matrix: # make an index with all collection members c_members = collections.defaultdict(list) for coll, member in graph.subject_objects(SKOS.member): diff --git a/annif/openapi/validation.py b/annif/openapi/validation.py index 77d732e64..0b2caec47 100644 --- a/annif/openapi/validation.py +++ b/annif/openapi/validation.py @@ -2,7 +2,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import jsonschema from connexion import decorators diff --git a/annif/project.py b/annif/project.py index 75345dee2..fa482c67f 100644 --- a/annif/project.py +++ b/annif/project.py @@ -41,7 +41,6 @@ ) from annif.corpus.subject import SubjectIndex from annif.registry import AnnifRegistry - from annif.suggestion import SuggestionBatch, SuggestionResults from annif.transform.transform import TransformChain from annif.vocab import AnnifVocabulary diff --git a/annif/rest.py b/annif/rest.py index 4101d856d..f6be1f7ae 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -17,7 +17,6 @@ from connexion.lifecycle import ConnexionResponse - from annif.corpus.document import DocumentList from annif.corpus.subject import SubjectIndex from annif.exception import ConfigurationException, NotSupportedException from annif.suggestion import SubjectSuggestion, SuggestionResults diff --git a/annif/suggestion.py b/annif/suggestion.py index 720232f6e..9935f1df1 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -9,8 +9,6 @@ from scipy.sparse import csr_array if TYPE_CHECKING: - from scipy.sparse._arrays import csr_array - from annif.corpus.subject import SubjectIndex SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score") @@ -25,10 +23,10 @@ def vector_to_suggestions(vector: np.ndarray, limit: int) -> Iterator[Any]: def filter_suggestion( - preds: scipy.sparse._arrays.csr_array, + preds: csr_array, limit: Optional[int] = None, threshold: Union[int, float] = 0.0, -) -> scipy.sparse._arrays.csr_array: +) -> csr_array: """filter a 2D sparse suggestion array (csr_array), retaining only the top K suggestions with a score above or equal to the threshold for each individual prediction; the rest will be left as zeros""" @@ -54,7 +52,7 @@ def filter_suggestion( class SuggestionResult: """Suggestions for a single document, backed by a row of a sparse array.""" - def __init__(self, array: scipy.sparse._arrays.csr_array, idx: int) -> None: + def __init__(self, array: csr_array, idx: int) -> None: self._array = array self._idx = idx @@ -79,7 +77,7 @@ def __len__(self) -> int: class SuggestionBatch: """Subject suggestions for a batch of documents.""" - def __init__(self, array: scipy.sparse._arrays.csr_array) -> None: + def __init__(self, array: csr_array) -> None: """Create a new SuggestionBatch from a csr_array""" assert isinstance(array, csr_array) self.array = array diff --git a/annif/transform/transform.py b/annif/transform/transform.py index a5d4b623c..3947eade0 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -10,14 +10,7 @@ if TYPE_CHECKING: from unittest.mock import Mock - from annif.corpus.combine import CombinedCorpus - from annif.corpus.document import ( - DocumentDirectory, - DocumentFile, - DocumentList, - 
LimitingDocumentCorpus, - TransformingDocumentCorpus, - ) + from annif.corpus import DocumentCorpus from annif.project import AnnifProject from annif.transform.inputlimiter import InputLimiter from annif.transform.langfilter import LangFilter @@ -98,14 +91,5 @@ def transform_text(self, text: str) -> str: text = trans.transform_fn(text) return text - def transform_corpus( - self, - corpus: Union[ - annif.corpus.document.DocumentDirectory, - annif.corpus.document.LimitingDocumentCorpus, - CombinedCorpus, - annif.corpus.document.DocumentList, - annif.corpus.document.DocumentFile, - ], - ) -> annif.corpus.document.TransformingDocumentCorpus: + def transform_corpus(self, corpus: DocumentCorpus) -> TransformingDocumentCorpus: return TransformingDocumentCorpus(corpus, self.transform_text) From 1660e5638cf63622e0ee27ae590fc3abe721a3b4 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 May 2023 15:00:53 +0300 Subject: [PATCH 05/28] Turn forward references into non-strings --- annif/backend/ensemble.py | 2 +- annif/corpus/subject.py | 4 ++-- annif/lexical/mllm.py | 4 ++-- annif/lexical/tokenset.py | 2 +- annif/suggestion.py | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index b7c049421..f097563b3 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -74,7 +74,7 @@ class EnsembleOptimizer(hyperopt.HyperparameterOptimizer): """Hyperparameter optimizer for the ensemble backend""" def __init__( - self, backend: "EnsembleBackend", corpus: DocumentDirectory, metric: str + self, backend: EnsembleBackend, corpus: DocumentDirectory, metric: str ) -> None: super().__init__(backend, corpus, metric) self._sources = [ diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 607e6c503..a873bd15d 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -209,7 +209,7 @@ def save(self, path: str) -> None: writer.writerow(row) @classmethod - def load(cls, path: str) -> "SubjectIndex": + def load(cls, path: str) -> SubjectIndex: """Load a subject index from a CSV file and return it.""" corpus = SubjectFileCSV(path) @@ -251,7 +251,7 @@ def __eq__(self, other: Union[SubjectSet, List[int], Set[int]]) -> bool: @classmethod def from_string( cls, subj_data: str, subject_index: SubjectIndex, language: str - ) -> "SubjectSet": + ) -> SubjectSet: subject_ids = set() for line in subj_data.splitlines(): uri, label = cls._parse_line(line) diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 5cf80cd4c..b593dc261 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -158,7 +158,7 @@ def generate_candidates( return generate_candidates(text, analyzer, self._vectorizer, self._index) @property - def _model_data(self) -> "ModelData": + def _model_data(self) -> ModelData: return ModelData( broader=self._broader_matrix, narrower=self._narrower_matrix, @@ -370,5 +370,5 @@ def save(self, filename: str) -> List[str]: return joblib.dump(self, filename) @staticmethod - def load(filename: str) -> "MLLMModel": + def load(filename: str) -> MLLMModel: return joblib.load(filename) diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py index 2c5895b26..e7499051c 100644 --- a/annif/lexical/tokenset.py +++ b/annif/lexical/tokenset.py @@ -30,7 +30,7 @@ def __len__(self) -> int: def __iter__(self): return iter(self._tokens) - def contains(self, other: "TokenSet") -> bool: + def contains(self, other: TokenSet) -> bool: """Returns True iff the
tokens in the other TokenSet are all included within this TokenSet.""" diff --git a/annif/suggestion.py b/annif/suggestion.py index 9935f1df1..7e830572b 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -88,7 +88,7 @@ def from_sequence( suggestion_results: List[List[SubjectSuggestion]], subject_index: SubjectIndex, limit: Optional[int] = None, - ) -> "SuggestionBatch": + ) -> SuggestionBatch: """Create a new SuggestionBatch from a sequence where each item is a sequence of SubjectSuggestion objects.""" @@ -112,7 +112,7 @@ def from_sequence( @classmethod def from_averaged( cls, batches: List[SuggestionBatch], weights: List[Union[int, float]] - ) -> "SuggestionBatch": + ) -> SuggestionBatch: """Create a new SuggestionBatch where the subject scores are the weighted average of scores in several SuggestionBatches""" @@ -123,7 +123,7 @@ def from_averaged( def filter( self, limit: Optional[int] = None, threshold: float = 0.0 - ) -> "SuggestionBatch": + ) -> SuggestionBatch: """Return a subset of the hits, filtered by the given limit and score threshold, as another SuggestionBatch object.""" @@ -149,7 +149,7 @@ def __init__(self, batches: List[SuggestionBatch]) -> None: def filter( self, limit: Optional[int] = None, threshold: float = 0.0 - ) -> "SuggestionResults": + ) -> SuggestionResults: """Return a view of these suggestions, filtered by the given limit and/or threshold, as another SuggestionResults object.""" From 61decb73c22d2a44bfbec6a7e6a1e58a37fa145d Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 23 May 2023 17:36:15 +0300 Subject: [PATCH 06/28] Use less specific types Types used in tests do not cover all cases. This also avoids many Union[] sets. --- annif/analyzer/__init__.py | 14 +++----------- annif/backend/__init__.py | 37 +++++++++++++----------------------- annif/backend/backend.py | 25 ++++-------------------- annif/backend/dummy.py | 4 ++-- annif/backend/ensemble.py | 8 ++++---- annif/backend/fasttext.py | 11 +++++------ annif/backend/mllm.py | 6 +++--- annif/backend/nn_ensemble.py | 10 +++++----- annif/backend/omikuji.py | 6 +++--- annif/backend/pav.py | 8 ++++---- annif/backend/stwfsa.py | 6 +++--- annif/backend/svc.py | 8 ++++---- annif/backend/tfidf.py | 6 +++--- annif/backend/yake.py | 4 ++-- annif/cli_util.py | 10 ++-------- annif/corpus/subject.py | 6 +----- annif/lexical/mllm.py | 16 ++++++++-------- annif/project.py | 32 +++++++++---------------------- annif/transform/transform.py | 14 ++++---------- annif/vocab.py | 13 ++++++------- 20 files changed, 88 insertions(+), 156 deletions(-) diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py index 0e8d7c189..a0f93ced3 100644 --- a/annif/analyzer/__init__.py +++ b/annif/analyzer/__init__.py @@ -2,7 +2,7 @@ from __future__ import annotations import re -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import annif from annif.util import parse_args @@ -10,11 +10,7 @@ from . 
import simple, simplemma, snowball if TYPE_CHECKING: - from annif.analyzer.simple import SimpleAnalyzer - from annif.analyzer.simplemma import SimplemmaAnalyzer - from annif.analyzer.snowball import SnowballAnalyzer - from annif.analyzer.spacy import SpacyAnalyzer - from annif.analyzer.voikko import VoikkoAnalyzer + from annif.analyzer.analyzer import Analyzer _analyzers = {} @@ -23,11 +19,7 @@ def register_analyzer(analyzer): _analyzers[analyzer.name] = analyzer -def get_analyzer( - analyzerspec: str, -) -> Union[ - SimplemmaAnalyzer, SimpleAnalyzer, SnowballAnalyzer, SpacyAnalyzer, VoikkoAnalyzer -]: +def get_analyzer(analyzerspec: str) -> Analyzer: match = re.match(r"(\w+)(\((.*)\))?", analyzerspec) if match is None: raise ValueError("Invalid analyzer specification {}".format(analyzerspec)) diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index 08957bf02..a0418f3b2 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -4,34 +4,23 @@ from typing import TYPE_CHECKING, Any, Type if TYPE_CHECKING: - from annif.backend.dummy import DummyBackend - from annif.backend.ensemble import EnsembleBackend - from annif.backend.fasttext import FastTextBackend - from annif.backend.http import HTTPBackend - from annif.backend.mllm import MLLMBackend - from annif.backend.nn_ensemble import NNEnsembleBackend - from annif.backend.omikuji import OmikujiBackend - from annif.backend.pav import PAVBackend - from annif.backend.stwfsa import StwfsaBackend - from annif.backend.svc import SVCBackend - from annif.backend.tfidf import TFIDFBackend - from annif.backend.yake import YakeBackend + from annif.backend.backend import AnnifBackend # define functions for lazily importing each backend (alphabetical order) -def _dummy() -> Type[DummyBackend]: +def _dummy() -> Type[AnnifBackend]: from . import dummy return dummy.DummyBackend -def _ensemble() -> Type[EnsembleBackend]: +def _ensemble() -> Type[AnnifBackend]: from . import ensemble return ensemble.EnsembleBackend -def _fasttext() -> Type[FastTextBackend]: +def _fasttext() -> Type[AnnifBackend]: try: from . import fasttext @@ -40,19 +29,19 @@ def _fasttext() -> Type[FastTextBackend]: raise ValueError("fastText not available, cannot use fasttext backend") -def _http() -> Type[HTTPBackend]: +def _http() -> Type[AnnifBackend]: from . import http return http.HTTPBackend -def _mllm() -> Type[MLLMBackend]: +def _mllm() -> Type[AnnifBackend]: from . import mllm return mllm.MLLMBackend -def _nn_ensemble() -> Type[NNEnsembleBackend]: +def _nn_ensemble() -> Type[AnnifBackend]: try: from . import nn_ensemble @@ -63,7 +52,7 @@ def _nn_ensemble() -> Type[NNEnsembleBackend]: ) -def _omikuji() -> Type[OmikujiBackend]: +def _omikuji() -> Type[AnnifBackend]: try: from . import omikuji @@ -72,13 +61,13 @@ def _omikuji() -> Type[OmikujiBackend]: raise ValueError("Omikuji not available, cannot use omikuji backend") -def _pav() -> Type[PAVBackend]: +def _pav() -> Type[AnnifBackend]: from . import pav return pav.PAVBackend -def _stwfsa() -> Type[StwfsaBackend]: +def _stwfsa() -> Type[AnnifBackend]: try: from . import stwfsa @@ -87,19 +76,19 @@ def _stwfsa() -> Type[StwfsaBackend]: raise ValueError("STWFSA not available, cannot use stwfsa backend") -def _svc() -> Type[SVCBackend]: +def _svc() -> Type[AnnifBackend]: from . import svc return svc.SVCBackend -def _tfidf() -> Type[TFIDFBackend]: +def _tfidf() -> Type[AnnifBackend]: from . 
import tfidf return tfidf.TFIDFBackend -def _yake() -> Type[YakeBackend]: +def _yake() -> Type[AnnifBackend]: try: from . import yake diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 8fd40257d..95036e14a 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -13,13 +13,7 @@ if TYPE_CHECKING: from unittest.mock import Mock - from annif.corpus.document import ( - DocumentDirectory, - DocumentFile, - DocumentList, - LimitingDocumentCorpus, - TransformingDocumentCorpus, - ) + from annif.corpus.document import DocumentCorpus from annif.project import AnnifProject @@ -80,7 +74,7 @@ def _get_backend_params( def _train( self, - corpus: TransformingDocumentCorpus, + corpus: DocumentCorpus, params: Dict[str, Union[int, str]], jobs: int = 0, ) -> None: @@ -90,13 +84,7 @@ def _train( def train( self, - corpus: Union[ - str, - TransformingDocumentCorpus, - DocumentList, - DocumentFile, - DocumentDirectory, - ], + corpus: DocumentCorpus, params: Optional[Union[Dict[str, Union[float, int]], Dict[str, int]]] = None, jobs: int = 0, ) -> None: @@ -164,12 +152,7 @@ def _learn(self, corpus, params): def learn( self, - corpus: Union[ - DocumentDirectory, - TransformingDocumentCorpus, - LimitingDocumentCorpus, - DocumentFile, - ], + corpus: DocumentCorpus, params: Optional[Dict[str, int]] = None, ) -> None: """Further train the model on the given document or subject corpus.""" diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index 4d8bc9f6e..fd6465ddb 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -8,7 +8,7 @@ from . import backend if TYPE_CHECKING: - from annif.corpus.document import DocumentDirectory, TransformingDocumentCorpus + from annif.corpus.document import DocumentCorpus class DummyBackend(backend.AnnifLearningBackend): @@ -46,7 +46,7 @@ def _suggest( def _learn( self, - corpus: Union[TransformingDocumentCorpus, DocumentDirectory], + corpus: DocumentCorpus, params: Dict[str, Union[int, str]], ) -> None: # in this dummy backend we "learn" by picking up the subject ID diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index f097563b3..d5f78fd72 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -16,7 +16,7 @@ from optuna.trial._trial import Trial from annif.backend.hyperopt import HPRecommendation - from annif.corpus.document import DocumentDirectory, DocumentFile + from annif.corpus.document import DocumentCorpus class BaseEnsembleBackend(backend.AnnifBackend): @@ -74,7 +74,7 @@ class EnsembleOptimizer(hyperopt.HyperparameterOptimizer): """Hyperparameter optimizer for the ensemble backend""" def __init__( - self, backend: EnsembleBackend, corpus: DocumentDirectory, metric: str + self, backend: EnsembleBackend, corpus: DocumentCorpus, metric: str ) -> None: super().__init__(backend, corpus, metric) self._sources = [ @@ -155,11 +155,11 @@ def modification_time(self) -> None: return max(filter(None, mtimes), default=None) def get_hp_optimizer( - self, corpus: DocumentDirectory, metric: str + self, corpus: DocumentCorpus, metric: str ) -> EnsembleOptimizer: return EnsembleOptimizer(self, corpus, metric) def _train( - self, corpus: DocumentFile, params: Dict[str, Union[int, str]], jobs: int = 0 + self, corpus: DocumentCorpus, params: Dict[str, Union[int, str]], jobs: int = 0 ): raise NotSupportedException("Training ensemble backend is not possible.") diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index 06a233ff2..c549a06cf 100644 --- a/annif/backend/fasttext.py +++ 
b/annif/backend/fasttext.py @@ -17,7 +17,7 @@ from fasttext.FastText import _FastText from numpy import ndarray - from annif.corpus.document import DocumentFile, TransformingDocumentCorpus + from annif.corpus.document import DocumentCorpus class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend): @@ -94,9 +94,7 @@ def _label_to_subject_id(self, label: str) -> int: labelnum = label.replace("__label__", "") return int(labelnum) - def _write_train_file( - self, corpus: Union[TransformingDocumentCorpus, DocumentFile], filename: str - ) -> None: + def _write_train_file(self, corpus: DocumentCorpus, filename: str) -> None: with open(filename, "w", encoding="utf-8") as trainfile: for doc in corpus.documents: text = self._normalize_text(doc.text) @@ -112,7 +110,8 @@ def _normalize_text(self, text: str) -> str: return " ".join(self.project.analyzer.tokenize_words(text)) def _create_train_file( - self, corpus: Union[TransformingDocumentCorpus, DocumentFile] + self, + corpus: DocumentCorpus, ) -> None: self.info("creating fastText training file") @@ -139,7 +138,7 @@ def _create_model( def _train( self, - corpus: Union[TransformingDocumentCorpus, DocumentFile, str], + corpus: DocumentCorpus, params: Dict[str, Union[int, float, str]], jobs: int = 0, ) -> None: diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index 6f460dee1..5491e5a59 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -20,7 +20,7 @@ from optuna.trial._trial import Trial from annif.backend.hyperopt import HPRecommendation - from annif.corpus.document import DocumentDirectory, DocumentFile + from annif.corpus.document import DocumentCorpus from annif.lexical.mllm import Candidate @@ -90,7 +90,7 @@ class MLLMBackend(hyperopt.AnnifHyperoptBackend): "use_hidden_labels": False, } - def get_hp_optimizer(self, corpus: DocumentDirectory, metric: str) -> MLLMOptimizer: + def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> MLLMOptimizer: return MLLMOptimizer(self, corpus, metric) def default_params(self) -> Dict[str, Union[int, float, bool]]: @@ -123,7 +123,7 @@ def initialize(self, parallel: bool = False) -> None: def _train( self, - corpus: Union[DocumentFile, str, DocumentDirectory], + corpus: DocumentCorpus, params: Dict[str, Union[int, float, bool, str]], jobs: int = 0, ) -> None: diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index 5ef2a857f..a8de4ac0b 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from tensorflow.python.framework.ops import EagerTensor - from annif.corpus.document import DocumentFile, LimitingDocumentCorpus + from annif.corpus.document import DocumentCorpus def idx_to_key(idx: int) -> bytes: @@ -198,7 +198,7 @@ def _create_model(self, sources: List[Union[Tuple[str, float], str]]) -> None: def _train( self, - corpus: Union[DocumentFile, str], + corpus: DocumentCorpus, params: Dict[str, Union[int, float, str]], jobs: int = 0, ) -> None: @@ -213,7 +213,7 @@ def _train( def _corpus_to_vectors( self, - corpus: Union[LimitingDocumentCorpus, DocumentFile], + corpus: DocumentCorpus, seq: LMDBSequence, n_jobs: int, ) -> None: @@ -259,7 +259,7 @@ def _open_lmdb(self, cached, lmdb_map_size): def _fit_model( self, - corpus: Union[LimitingDocumentCorpus, DocumentFile, str], + corpus: DocumentCorpus, epochs: int, lmdb_map_size: int, n_jobs: int = 1, @@ -285,7 +285,7 @@ def _fit_model( def _learn( self, - corpus: Union[LimitingDocumentCorpus, DocumentFile], + corpus: DocumentCorpus, params: 
Dict[str, Union[int, float, str]], ) -> None: self.initialize() diff --git a/annif/backend/omikuji.py b/annif/backend/omikuji.py index 1c2c51645..e60467d85 100644 --- a/annif/backend/omikuji.py +++ b/annif/backend/omikuji.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: from scipy.sparse._csr import csr_matrix - from annif.corpus.document import DocumentFile + from annif.corpus.document import DocumentCorpus class OmikujiBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): @@ -69,7 +69,7 @@ def initialize(self, parallel: bool = False) -> None: self.initialize_vectorizer() self._initialize_model() - def _create_train_file(self, veccorpus: csr_matrix, corpus: DocumentFile) -> None: + def _create_train_file(self, veccorpus: csr_matrix, corpus: DocumentCorpus) -> None: self.info("creating train file") path = os.path.join(self.datadir, self.TRAIN_FILE) with open(path, "w", encoding="utf-8") as trainfile: @@ -113,7 +113,7 @@ def _create_model(self, params: Dict[str, Union[int, bool]], jobs: int) -> None: def _train( self, - corpus: Union[DocumentFile, str], + corpus: DocumentCorpus, params: Dict[str, Union[int, bool]], jobs: int = 0, ) -> None: diff --git a/annif/backend/pav.py b/annif/backend/pav.py index dc35eb5fb..dba85f62e 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -20,7 +20,7 @@ from . import backend, ensemble if TYPE_CHECKING: - from annif.corpus.document import DocumentFile + from annif.corpus.document import DocumentCorpus from annif.project import AnnifProject @@ -94,7 +94,7 @@ def _merge_source_batches( @staticmethod def _suggest_train_corpus( - source_project: AnnifProject, corpus: DocumentFile + source_project: AnnifProject, corpus: DocumentCorpus ) -> Tuple[csc_matrix, csc_matrix]: # lists for constructing score matrix data, row, col = [], [], [] @@ -128,7 +128,7 @@ def _suggest_train_corpus( return csc_matrix(scores), csc_matrix(true) def _create_pav_model( - self, source_project_id: str, min_docs: int, corpus: DocumentFile + self, source_project_id: str, min_docs: int, corpus: DocumentCorpus ) -> None: self.info( "creating PAV model for source {}, min_docs={}".format( @@ -155,7 +155,7 @@ def _create_pav_model( def _train( self, - corpus: Union[str, DocumentFile], + corpus: DocumentCorpus, params: Dict[str, Union[int, str]], jobs: int = 0, ) -> None: diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index 688a03112..d8818ca16 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -12,7 +12,7 @@ from . 
import backend if TYPE_CHECKING: - from annif.corpus.document import DocumentFile, DocumentList + from annif.corpus.document import DocumentCorpus _KEY_CONCEPT_TYPE_URI = "concept_type_uri" _KEY_SUBTHESAURUS_TYPE_URI = "sub_thesaurus_type_uri" @@ -78,7 +78,7 @@ def initialize(self, parallel: bool = False) -> None: ) def _load_data( - self, corpus: Union[DocumentList, DocumentFile, str] + self, corpus: DocumentCorpus ) -> Tuple[List[str], List[List[Union[str, Any]]]]: if corpus == "cached": raise NotSupportedException( @@ -103,7 +103,7 @@ def _load_data( def _train( self, - corpus: Union[DocumentList, DocumentFile, str], + corpus: DocumentCorpus, params: Dict[str, Union[str, bool, int]], jobs: int = 0, ) -> None: diff --git a/annif/backend/svc.py b/annif/backend/svc.py index cd5014f35..34f989a6e 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Tuple import joblib import numpy as np @@ -18,7 +18,7 @@ if TYPE_CHECKING: from scipy.sparse._csr import csr_matrix - from annif.corpus.document import DocumentFile + from annif.corpus.document import DocumentCorpus class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): @@ -54,7 +54,7 @@ def initialize(self, parallel: bool = False) -> None: self._initialize_model() def _corpus_to_texts_and_classes( - self, corpus: DocumentFile + self, corpus: DocumentCorpus ) -> Tuple[List[str], List[int]]: texts = [] classes = [] @@ -79,7 +79,7 @@ def _train_classifier(self, veccorpus: csr_matrix, classes: List[int]) -> None: ) def _train( - self, corpus: Union[DocumentFile, str], params: Dict[str, int], jobs: int = 0 + self, corpus: DocumentCorpus, params: Dict[str, int], jobs: int = 0 ) -> None: if corpus == "cached": raise NotSupportedException( diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index 76d0b5622..bb566287d 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -18,7 +18,7 @@ if TYPE_CHECKING: from scipy.sparse._csr import csr_matrix - from annif.corpus.document import DocumentFile, TransformingDocumentCorpus + from annif.corpus.document import DocumentCorpus class SubjectBuffer: @@ -70,7 +70,7 @@ class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): INDEX_FILE = "tfidf-index" def _generate_subjects_from_documents( - self, corpus: Union[TransformingDocumentCorpus, DocumentFile] + self, corpus: DocumentCorpus ) -> Iterator[str]: with tempfile.TemporaryDirectory() as tempdir: subject_buffer = {} @@ -111,7 +111,7 @@ def _create_index(self, veccorpus: csr_matrix) -> None: def _train( self, - corpus: Union[TransformingDocumentCorpus, DocumentFile, str], + corpus: DocumentCorpus, params: Dict[str, Union[str, int]], jobs: int = 0, ) -> None: diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 80da1da7e..7f8785b48 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -24,7 +24,7 @@ from numpy import float64 from rdflib.term import URIRef - from annif.corpus.document import DocumentFile + from annif.corpus.document import DocumentCorpus class YakeBackend(backend.AnnifBackend): @@ -196,5 +196,5 @@ def _combine_scores(self, score1: float, score2: float) -> float: confl = score1 * score2 / (score1 * score2 + (1 - score1) * (1 - score2)) return (confl - 0.5) * 2 - def _train(self, corpus: DocumentFile, params: Dict[str, Any], jobs: int = 0): + def _train(self, corpus: DocumentCorpus, params: Dict[str, 
Any], jobs: int = 0): raise NotSupportedException("Training yake backend is not possible.") diff --git a/annif/cli_util.py b/annif/cli_util.py index 7e75ff04f..be1c07690 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -21,13 +21,7 @@ from click.core import Argument, Context, Option - from annif.corpus.combine import CombinedCorpus - from annif.corpus.document import ( - DocumentDirectory, - DocumentFile, - DocumentList, - LimitingDocumentCorpus, - ) + from annif.corpus.document import DocumentCorpus, DocumentList from annif.corpus.subject import SubjectIndex from annif.project import AnnifProject from annif.suggestion import SuggestionResult @@ -138,7 +132,7 @@ def open_documents( subject_index: SubjectIndex, vocab_lang: str, docs_limit: Optional[int], -) -> Union[LimitingDocumentCorpus, DocumentDirectory, CombinedCorpus, DocumentFile]: +) -> DocumentCorpus: """Helper function to open a document corpus from a list of pathnames, each of which is either a TSV file or a directory of TXT files. For directories with subjects in TSV files, the given vocabulary language diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index a873bd15d..42580a395 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -14,8 +14,6 @@ if TYPE_CHECKING: import numpy as np - from annif.corpus.skos import SubjectFileSKOS - logger = annif.logger.getChild("subject") logger.addFilter(annif.util.DuplicateFilter()) @@ -121,9 +119,7 @@ def __init__(self) -> None: self._label_idx = {} self._languages = None - def load_subjects( - self, corpus: Union[SubjectFileSKOS, SubjectFileCSV, SubjectFileTSV] - ) -> None: + def load_subjects(self, corpus: SubjectCorpus) -> None: """Initialize the subject index from a subject corpus""" self._languages = corpus.languages diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index b593dc261..ab92277ff 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -28,8 +28,8 @@ from rdflib.graph import Graph from rdflib.term import URIRef - from annif.analyzer.snowball import SnowballAnalyzer - from annif.corpus.document import DocumentDirectory + from annif.analyzer import Analyzer + from annif.corpus.document import DocumentCorpus from annif.vocab import AnnifVocabulary Term = collections.namedtuple("Term", "subject_id label is_pref") @@ -79,7 +79,7 @@ def conflate_matches( def generate_candidates( text: str, - analyzer: SnowballAnalyzer, + analyzer: Analyzer, vectorizer: CountVectorizer, index: TokenSetIndex, ) -> List[Union[Candidate, Any]]: @@ -153,7 +153,7 @@ class MLLMModel: """Maui-like Lexical Matching model""" def generate_candidates( - self, text: str, analyzer: SnowballAnalyzer + self, text: str, analyzer: Analyzer ) -> List[Union[Candidate, Any]]: return generate_candidates(text, analyzer, self._vectorizer, self._index) @@ -219,7 +219,7 @@ def _prepare_relations(self, graph: Graph, vocab: AnnifVocabulary) -> None: def _prepare_train_index( self, vocab: AnnifVocabulary, - analyzer: SnowballAnalyzer, + analyzer: Analyzer, params: Dict[str, Union[int, float, bool, str]], ) -> List[int]: graph = vocab.as_graph() @@ -245,7 +245,7 @@ def _prepare_train_index( return subject_ids def _prepare_train_data( - self, corpus: DocumentDirectory, analyzer: SnowballAnalyzer, n_jobs: int + self, corpus: DocumentCorpus, analyzer: Analyzer, n_jobs: int ) -> Tuple[List[List[Union[Candidate, Any]]], List[bool]]: # frequency of subjects (by id) in the generated candidates self._doc_freq = collections.Counter() @@ -302,9 +302,9 @@ def _prepare_features( def 
prepare_train( self, - corpus: DocumentDirectory, + corpus: DocumentCorpus, vocab: AnnifVocabulary, - analyzer: SnowballAnalyzer, + analyzer: Analyzer, params: Dict[str, Union[int, float, bool, str]], n_jobs: int, ) -> Tuple[np.ndarray, np.ndarray]: diff --git a/annif/project.py b/annif/project.py index fa482c67f..02e921dbc 100644 --- a/annif/project.py +++ b/annif/project.py @@ -25,20 +25,10 @@ from click.utils import LazyFile - from annif.analyzer.snowball import SnowballAnalyzer - from annif.backend.dummy import DummyBackend - from annif.backend.ensemble import EnsembleBackend - from annif.backend.fasttext import FastTextBackend + from annif.analyzer import Analyzer + from annif.backend import AnnifBackend from annif.backend.hyperopt import HPRecommendation - from annif.backend.pav import PAVBackend - from annif.backend.tfidf import TFIDFBackend - from annif.corpus.combine import CombinedCorpus - from annif.corpus.document import ( - DocumentDirectory, - DocumentFile, - DocumentList, - LimitingDocumentCorpus, - ) + from annif.corpus.document import DocumentCorpus from annif.corpus.subject import SubjectIndex from annif.registry import AnnifRegistry from annif.transform.transform import TransformChain @@ -152,7 +142,7 @@ def _suggest_with_backend( return self.backend.suggest(texts, beparams) @property - def analyzer(self) -> SnowballAnalyzer: + def analyzer(self) -> Analyzer: if self._analyzer is None: if self.analyzer_spec: self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec) @@ -171,11 +161,7 @@ def transform(self) -> TransformChain: return self._transform @property - def backend( - self, - ) -> Union[ - DummyBackend, EnsembleBackend, PAVBackend, TFIDFBackend, FastTextBackend - ]: + def backend(self) -> AnnifBackend: if self._backend is None: if "backend" not in self.config: raise ConfigurationException( @@ -239,7 +225,7 @@ def modification_time(self) -> Optional[datetime]: def suggest_corpus( self, - corpus: Union[DocumentDirectory, DocumentList], + corpus: DocumentCorpus, backend_params: None = None, ) -> annif.suggestion.SuggestionResults: """Suggest subjects for the given documents corpus in batches of documents.""" @@ -266,7 +252,7 @@ def suggest( def train( self, - corpus: Union[CombinedCorpus, LimitingDocumentCorpus, DocumentFile, str], + corpus: DocumentCorpus, backend_params: None = None, jobs: int = 0, ) -> None: @@ -280,7 +266,7 @@ def train( def learn( self, - corpus: Union[DocumentDirectory, DocumentFile, DocumentList], + corpus: DocumentCorpus, backend_params: None = None, ) -> None: """further train the project using documents from a metadata source""" @@ -297,7 +283,7 @@ def learn( def hyperopt( self, - corpus: DocumentDirectory, + corpus: DocumentCorpus, trials: int, jobs: int, metric: str, diff --git a/annif/transform/transform.py b/annif/transform/transform.py index 3947eade0..111af40df 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -10,10 +10,8 @@ if TYPE_CHECKING: from unittest.mock import Mock - from annif.corpus import DocumentCorpus + from annif.corpus.types import DocumentCorpus from annif.project import AnnifProject - from annif.transform.inputlimiter import InputLimiter - from annif.transform.langfilter import LangFilter class BaseTransform(metaclass=abc.ABCMeta): @@ -46,9 +44,7 @@ class TransformChain: def __init__( self, - transform_classes: List[ - Union[Type[InputLimiter], Type[IdentityTransform], Type[LangFilter]] - ], + transform_classes: List[Type[BaseTransform]], args: List[ Union[ Tuple[List[Any], 
Dict[str, str]], @@ -63,9 +59,7 @@ def __init__( def _init_transforms( self, - transform_classes: List[ - Union[Type[InputLimiter], Type[IdentityTransform], Type[LangFilter]] - ], + transform_classes: List[Type[BaseTransform]], args: List[ Union[ Tuple[List[Any], Dict[str, str]], @@ -73,7 +67,7 @@ def _init_transforms( Tuple[List[Any], Dict[Any, Any]], ] ], - ) -> List[Union[InputLimiter, IdentityTransform, LangFilter]]: + ) -> List[Type[BaseTransform]]: transforms = [] for trans, (posargs, kwargs) in zip(transform_classes, args): try: diff --git a/annif/vocab.py b/annif/vocab.py index d63f0ca7d..b33550920 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, List, Union +from typing import TYPE_CHECKING, List import annif import annif.corpus @@ -14,7 +14,8 @@ from rdflib.graph import Graph from annif.corpus.skos import SubjectFileSKOS - from annif.corpus.subject import SubjectFileCSV, SubjectFileTSV, SubjectIndex + from annif.corpus.subject import SubjectCorpus, SubjectIndex + logger = annif.logger @@ -36,15 +37,13 @@ def __init__(self, vocab_id: str, datadir: str) -> None: self.vocab_id = vocab_id self._skos_vocab = None - def _create_subject_index( - self, subject_corpus: Union[SubjectFileCSV, SubjectFileTSV, SubjectFileSKOS] - ) -> SubjectIndex: + def _create_subject_index(self, subject_corpus: SubjectCorpus) -> SubjectIndex: subjects = annif.corpus.SubjectIndex() subjects.load_subjects(subject_corpus) annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV) return subjects - def _update_subject_index(self, subject_corpus: SubjectFileTSV) -> SubjectIndex: + def _update_subject_index(self, subject_corpus: SubjectCorpus) -> SubjectIndex: old_subjects = self.subjects new_subjects = annif.corpus.SubjectIndex() new_subjects.load_subjects(subject_corpus) @@ -113,7 +112,7 @@ def languages(self) -> List[str]: def load_vocabulary( self, - subject_corpus: Union[SubjectFileCSV, SubjectFileTSV, SubjectFileSKOS], + subject_corpus: SubjectCorpus, force: bool = False, ) -> None: """Load subjects from a subject corpus and save them into one From ba77e843107040dc43bd9e8c7f6bd475cbe4847e Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 24 May 2023 10:31:19 +0300 Subject: [PATCH 07/28] Remove "Union[Any," in hints These would allow all types --- annif/analyzer/analyzer.py | 6 +++--- annif/backend/http.py | 4 ++-- annif/backend/yake.py | 8 ++++---- annif/corpus/subject.py | 2 +- annif/lexical/mllm.py | 4 ++-- annif/lexical/tokenset.py | 4 ++-- annif/lexical/util.py | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 00657a094..137a5db18 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -4,7 +4,7 @@ import abc import functools import unicodedata -from typing import Any, List, Union +from typing import List _KEY_TOKEN_MIN_LENGTH = "token_min_length" @@ -21,7 +21,7 @@ def __init__(self, **kwargs) -> None: if _KEY_TOKEN_MIN_LENGTH in kwargs: self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH]) - def tokenize_sentences(self, text: str) -> List[Union[Any, str]]: + def tokenize_sentences(self, text: str) -> List[str]: """Tokenize a piece of text (e.g. 
a document) into sentences.""" import nltk.tokenize @@ -38,7 +38,7 @@ def is_valid_token(self, word: str) -> bool: return True return False - def tokenize_words(self, text: str, filter: bool = True) -> List[Union[Any, str]]: + def tokenize_words(self, text: str, filter: bool = True) -> List[str]: """Tokenize a piece of text (e.g. a sentence) into words. If filter=True (default), only return valid tokens (e.g. not punctuation, numbers or very short words)""" diff --git a/annif/backend/http.py b/annif/backend/http.py index 8f26abe1b..59a729b89 100644 --- a/annif/backend/http.py +++ b/annif/backend/http.py @@ -3,7 +3,7 @@ from __future__ import annotations import importlib -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union import dateutil.parser import requests @@ -65,7 +65,7 @@ def _get_project_info(self, key: str) -> Optional[Union[bool, str]]: def _suggest( self, text: str, params: Dict[str, Union[int, str]] - ) -> List[Union[Any, SubjectSuggestion]]: + ) -> List[SubjectSuggestion]: data = {"text": text} if "project" in params: data["project"] = params["project"] diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 7f8785b48..ea09f21cd 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -145,8 +145,8 @@ def _suggest( return subject_suggestions def _keyphrases2suggestions( - self, keyphrases: List[Union[Any, Tuple[str, float64]]] - ) -> List[Union[Any, Tuple[str, float64]]]: + self, keyphrases: List[Tuple[str, float64]] + ) -> List[Tuple[str, float64]]: suggestions = [] not_matched = [] for kp, score in keyphrases: @@ -178,8 +178,8 @@ def _transform_score(self, score: float64) -> float64: return 1.0 / (score + 1) def _combine_suggestions( - self, suggestions: List[Union[Any, Tuple[str, float], Tuple[str, float64]]] - ) -> List[Union[Any, Tuple[str, float], Tuple[str, float64]]]: + self, suggestions: List[Tuple[str, float], Tuple[str, float64]] + ) -> List[Tuple[str, float], Tuple[str, float64]]: combined_suggestions = {} for uri, score in suggestions: if uri not in combined_suggestions: diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 42580a395..1d2002027 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -169,7 +169,7 @@ def by_label(self, label: Optional[str], language: str) -> Optional[int]: logger.warning('Unknown subject label "%s"@%s', label, language) return None - def deprecated_ids(self) -> List[Union[Any, int]]: + def deprecated_ids(self) -> List[int]: """return indices of deprecated subjects""" return [ diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index ab92277ff..43fd1ffe8 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -56,7 +56,7 @@ def conflate_matches( - matches: List[Union[Any, Match]], doc_length: int + matches: List[Match], doc_length: int ) -> List[Union[Candidate, Any]]: subj_matches = collections.defaultdict(list) for match in matches: @@ -359,7 +359,7 @@ def _prediction_to_list( def predict( self, candidates: List[Union[Candidate, Any]] - ) -> List[Union[Any, Tuple[np.float64, int]]]: + ) -> List[Tuple[np.float64, int]]: if not candidates: return [] features = self._candidates_to_features(candidates) diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py index e7499051c..42a11a4d4 100644 --- a/annif/lexical/tokenset.py +++ b/annif/lexical/tokenset.py @@ -2,7 +2,7 @@ from __future__ import annotations import collections -from typing import TYPE_CHECKING, Any, Dict, List, Optional, 
Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: from numpy import int32, ndarray @@ -87,7 +87,7 @@ def _find_subj_ambiguity(self, tsets): return subj_ambiguity - def search(self, tset: TokenSet) -> List[Union[Any, Tuple[TokenSet, int]]]: + def search(self, tset: TokenSet) -> List[Tuple[TokenSet, int]]: """Return the TokenSets that are contained in the given TokenSet. The matches are returned as a list of (TokenSet, ambiguity) pairs where ambiguity is an integer indicating the number of other TokenSets diff --git a/annif/lexical/util.py b/annif/lexical/util.py index a2c6110e6..0195ce05b 100644 --- a/annif/lexical/util.py +++ b/annif/lexical/util.py @@ -2,7 +2,7 @@ from __future__ import annotations import collections -from typing import TYPE_CHECKING, Any, List, Union +from typing import TYPE_CHECKING, List from rdflib import URIRef from rdflib.namespace import SKOS @@ -16,7 +16,7 @@ def get_subject_labels( graph: Graph, uri: str, properties: List[URIRef], language: str -) -> List[Union[Any, str]]: +) -> List[str]: return [ str(label) for prop in properties From 2f9ee1077d923e883a8674e3a49cae5624d1c98e Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 24 May 2023 14:40:34 +0300 Subject: [PATCH 08/28] Move comment back to its original place --- annif/backend/yake.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index ea09f21cd..4989a4805 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -1,4 +1,6 @@ """Annif backend using Yake keyword extraction""" +# For license remarks of this backend see README.md: +# https://github.com/NatLibFi/Annif#license. from __future__ import annotations import os.path @@ -16,10 +18,6 @@ from . import backend -# For license remarks of this backend see README.md: -# https://github.com/NatLibFi/Annif#license. 
- - if TYPE_CHECKING: from numpy import float64 from rdflib.term import URIRef From d2c5e53521b1bd7796b0c99888581f0a479b3c9c Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 24 May 2023 15:29:15 +0300 Subject: [PATCH 09/28] Fix some omissions and errors by monkeytype --- annif/__init__.py | 2 +- annif/backend/backend.py | 8 +++----- annif/backend/nn_ensemble.py | 2 +- annif/corpus/subject.py | 4 ++-- annif/exception.py | 3 +-- annif/project.py | 6 +++--- annif/transform/__init__.py | 4 +--- annif/transform/inputlimiter.py | 6 ++---- annif/transform/langfilter.py | 4 ++-- annif/transform/transform.py | 6 ++---- 10 files changed, 18 insertions(+), 27 deletions(-) diff --git a/annif/__init__.py b/annif/__init__.py index 6258afb0a..f239f85bb 100644 --- a/annif/__init__.py +++ b/annif/__init__.py @@ -18,7 +18,7 @@ from flask.app import Flask -def create_flask_app(config_name: None = None) -> Flask: +def create_flask_app(config_name: Optional[str] = None) -> Flask: """Create a Flask app to be used by the CLI.""" from flask import Flask diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 95036e14a..152b57deb 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -11,8 +11,6 @@ from annif.suggestion import SuggestionBatch if TYPE_CHECKING: - from unittest.mock import Mock - from annif.corpus.document import DocumentCorpus from annif.project import AnnifProject @@ -26,7 +24,7 @@ class AnnifBackend(metaclass=abc.ABCMeta): DEFAULT_PARAMETERS = {"limit": 100} def __init__( - self, backend_id: str, config_params: Any, project: Union[Mock, AnnifProject] + self, backend_id: str, config_params: Any, project: AnnifProject ) -> None: """Initialize backend with specific parameters. The parameters are a dict. Keys and values depend on the specific @@ -106,7 +104,7 @@ def _suggest(self, text, params): pass # pragma: no cover def _suggest_batch( - self, texts: Union[str, List[str]], params: Dict[str, Any] + self, texts: List[str], params: Dict[str, Any] ) -> SuggestionBatch: """This method can be implemented by backends to use batching of documents in their operations. 
This default implementation uses the regular suggest @@ -119,7 +117,7 @@ def _suggest_batch( def suggest( self, - texts: Union[str, List[str]], + texts: List[str], params: Optional[Union[Dict[str, str], Dict[str, int]]] = None, ) -> SuggestionBatch: """Suggest subjects for the input documents and return a list of subject sets diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index a8de4ac0b..1ab8b35b6 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -164,7 +164,7 @@ def _merge_source_batches( self.project.subjects, ) - def _create_model(self, sources: List[Union[Tuple[str, float], str]]) -> None: + def _create_model(self, sources: List[Tuple[str, float]]) -> None: self.info("creating NN ensemble model") inputs = Input(shape=(len(self.project.subjects), len(sources))) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 1d2002027..49cb88f7d 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -3,7 +3,7 @@ import csv import os.path -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union import annif import annif.util @@ -238,7 +238,7 @@ def __getitem__(self, idx: int) -> int: def __bool__(self) -> bool: return bool(self._subject_ids) - def __eq__(self, other: Union[SubjectSet, List[int], Set[int]]) -> bool: + def __eq__(self, other: SubjectSet) -> bool: if isinstance(other, SubjectSet): return self._subject_ids == other._subject_ids diff --git a/annif/exception.py b/annif/exception.py index 8fd5a06dc..64d1daaf6 100644 --- a/annif/exception.py +++ b/annif/exception.py @@ -7,7 +7,6 @@ if TYPE_CHECKING: from configparser import DuplicateSectionError - from unittest.mock import Mock class AnnifException(ClickException): @@ -18,7 +17,7 @@ class AnnifException(ClickException): def __init__( self, message: Union[DuplicateSectionError, str], - project_id: Optional[Union[Mock, str]] = None, + project_id: Optional[str] = None, backend_id: Optional[str] = None, ) -> None: super().__init__(message) diff --git a/annif/project.py b/annif/project.py index 02e921dbc..722e46c9b 100644 --- a/annif/project.py +++ b/annif/project.py @@ -226,7 +226,7 @@ def modification_time(self) -> Optional[datetime]: def suggest_corpus( self, corpus: DocumentCorpus, - backend_params: None = None, + backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, ) -> annif.suggestion.SuggestionResults: """Suggest subjects for the given documents corpus in batches of documents.""" suggestions = ( @@ -253,7 +253,7 @@ def suggest( def train( self, corpus: DocumentCorpus, - backend_params: None = None, + backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, jobs: int = 0, ) -> None: """train the project using documents from a metadata source""" @@ -267,7 +267,7 @@ def train( def learn( self, corpus: DocumentCorpus, - backend_params: None = None, + backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, ) -> None: """further train the project using documents from a metadata source""" if backend_params is None: diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index bda282bed..d25af5b3b 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -11,8 +11,6 @@ from . 
import inputlimiter, transform if TYPE_CHECKING: - from unittest.mock import Mock - from annif.project import AnnifProject from annif.transform.transform import TransformChain @@ -45,7 +43,7 @@ def parse_specs( def get_transform( - transform_specs: str, project: Optional[Union[AnnifProject, Mock]] + transform_specs: str, project: Optional[AnnifProject] ) -> TransformChain: transform_defs = parse_specs(transform_specs) transform_classes = [] diff --git a/annif/transform/inputlimiter.py b/annif/transform/inputlimiter.py index 6b8c3e9e1..7729e7ae9 100644 --- a/annif/transform/inputlimiter.py +++ b/annif/transform/inputlimiter.py @@ -2,15 +2,13 @@ given character length.""" from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional from annif.exception import ConfigurationException from . import transform if TYPE_CHECKING: - from unittest.mock import Mock - from annif.project import AnnifProject @@ -18,7 +16,7 @@ class InputLimiter(transform.BaseTransform): name = "limit" def __init__( - self, project: Optional[Union[Mock, AnnifProject]], input_limit: str + self, project: Optional[AnnifProject], input_limit: str ) -> None: super().__init__(project) self.input_limit = int(input_limit) diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index 3e63ee6d0..018ea3996 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -11,7 +11,7 @@ from . import transform if TYPE_CHECKING: - from unittest.mock import Mock + from annif.project import AnnifProject logger = annif.logger @@ -21,7 +21,7 @@ class LangFilter(transform.BaseTransform): def __init__( self, - project: Mock, + project: AnnifProject, text_min_length: Union[int, str] = 500, sentence_min_length: Union[int, str] = 50, min_ratio: float = 0.5, diff --git a/annif/transform/transform.py b/annif/transform/transform.py index 111af40df..caa4fc9c0 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -8,8 +8,6 @@ from annif.exception import ConfigurationException if TYPE_CHECKING: - from unittest.mock import Mock - from annif.corpus.types import DocumentCorpus from annif.project import AnnifProject @@ -20,7 +18,7 @@ class BaseTransform(metaclass=abc.ABCMeta): name = None - def __init__(self, project: Optional[Union[AnnifProject, Mock]]) -> None: + def __init__(self, project: Optional[AnnifProject]) -> None: self.project = project @abc.abstractmethod @@ -52,7 +50,7 @@ def __init__( Tuple[List[Any], Dict[Any, Any]], ] ], - project: Optional[Union[AnnifProject, Mock]], + project: Optional[AnnifProject], ) -> None: self.project = project self.transforms = self._init_transforms(transform_classes, args) From f242a98d3712c4ed25544d9f697dce216bb14dbf Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 24 May 2023 16:29:24 +0300 Subject: [PATCH 10/28] Simplify hints using float for Union[int, float] --- annif/backend/backend.py | 6 ++---- annif/backend/ensemble.py | 2 +- annif/backend/fasttext.py | 10 ++++------ annif/backend/mllm.py | 8 ++++---- annif/backend/nn_ensemble.py | 8 ++++---- annif/eval.py | 2 +- annif/lexical/mllm.py | 12 ++++++------ annif/rest.py | 4 ++-- annif/suggestion.py | 6 +++--- annif/transform/inputlimiter.py | 4 +--- 10 files changed, 28 insertions(+), 34 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 152b57deb..ea18479db 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -61,9 +61,7 @@ 
def modification_time(self) -> Optional[datetime.datetime]: def _get_backend_params( self, - params: Optional[ - Union[Dict[str, str], Dict[str, int], Dict[str, Union[float, int]]] - ], + params: Optional[Union[Dict[str, str], Dict[str, int], Dict[str, float]]], ) -> Dict[str, Any]: backend_params = dict(self.params) if params is not None: @@ -83,7 +81,7 @@ def _train( def train( self, corpus: DocumentCorpus, - params: Optional[Union[Dict[str, Union[float, int]], Dict[str, int]]] = None, + params: Optional[Union[Dict[str, float], Dict[str, int]]] = None, jobs: int = 0, ) -> None: """Train the model on the given document or subject corpus.""" diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index d5f78fd72..08dac5b2a 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -63,7 +63,7 @@ def _merge_source_batches( ) def _suggest_batch( - self, texts: List[str], params: Dict[str, Union[int, float, str]] + self, texts: List[str], params: Dict[str, Union[float, str]] ) -> SuggestionBatch: sources = annif.util.parse_sources(params["sources"]) batch_by_source = self._suggest_with_sources(texts, sources) diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index c549a06cf..740162bc8 100644 --- a/annif/backend/fasttext.py +++ b/annif/backend/fasttext.py @@ -56,7 +56,7 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend): # defaults for uninitialized instances _model = None - def default_params(self) -> Dict[str, Union[int, float, str]]: + def default_params(self) -> Dict[str, Union[float, str]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(mixins.ChunkingBackend.DEFAULT_PARAMETERS) params.update(self.DEFAULT_PARAMETERS) @@ -119,9 +119,7 @@ def _create_train_file( corpus, self.datadir, self.TRAIN_FILE, method=self._write_train_file ) - def _create_model( - self, params: Dict[str, Union[int, float, str]], jobs: int - ) -> None: + def _create_model(self, params: Dict[str, Union[float, str]], jobs: int) -> None: self.info("creating fastText model") trainpath = os.path.join(self.datadir, self.TRAIN_FILE) modelpath = os.path.join(self.datadir, self.MODEL_FILE) @@ -139,7 +137,7 @@ def _create_model( def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, float, str]], + params: Dict[str, Union[float, str]], jobs: int = 0, ) -> None: if corpus != "cached": @@ -165,7 +163,7 @@ def _predict_chunks( ) def _suggest_chunks( - self, chunktexts: List[str], params: Dict[str, Union[int, float, str]] + self, chunktexts: List[str], params: Dict[str, Union[float, str]] ) -> List[SubjectSuggestion]: limit = int(params["limit"]) chunklabels, chunkscores = self._predict_chunks(chunktexts, limit) diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index 5491e5a59..b8d530e64 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -93,7 +93,7 @@ class MLLMBackend(hyperopt.AnnifHyperoptBackend): def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> MLLMOptimizer: return MLLMOptimizer(self, corpus, metric) - def default_params(self) -> Dict[str, Union[int, float, bool]]: + def default_params(self) -> Dict[str, Union[float, bool]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -124,7 +124,7 @@ def initialize(self, parallel: bool = False) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, float, bool, str]], + params: Dict[str, Union[float, bool, str]], jobs: int = 0, ) -> None: 
self.info("starting train") @@ -158,7 +158,7 @@ def _generate_candidates(self, text: str) -> List[Union[Candidate, Any]]: def _prediction_to_result( self, prediction: List[Union[Tuple[np.float64, int], Any]], - params: Dict[str, Union[int, float, bool, str]], + params: Dict[str, Union[float, bool, str]], ) -> Iterator[Any]: vector = np.zeros(len(self.project.subjects), dtype=np.float32) for score, subject_id in prediction: @@ -166,7 +166,7 @@ def _prediction_to_result( return vector_to_suggestions(vector, int(params["limit"])) def _suggest( - self, text: str, params: Dict[str, Union[int, float, bool, str]] + self, text: str, params: Dict[str, Union[float, bool, str]] ) -> Iterator[Any]: candidates = self._generate_candidates(text) prediction = self._model.predict(candidates) diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index 1ab8b35b6..29f2add94 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -112,7 +112,7 @@ class NNEnsembleBackend(backend.AnnifLearningBackend, ensemble.BaseEnsembleBacke # defaults for uninitialized instances _model = None - def default_params(self) -> Dict[str, Union[int, float, str]]: + def default_params(self) -> Dict[str, Union[float, str]]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -140,7 +140,7 @@ def _merge_source_batches( self, batch_by_source: Dict[str, SuggestionBatch], sources: List[Tuple[str, float]], - params: Dict[str, Union[int, float, str]], + params: Dict[str, Union[float, str]], ) -> SuggestionBatch: src_weight = dict(sources) score_vectors = np.array( @@ -199,7 +199,7 @@ def _create_model(self, sources: List[Tuple[str, float]]) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, float, str]], + params: Dict[str, Union[float, str]], jobs: int = 0, ) -> None: sources = annif.util.parse_sources(self.params["sources"]) @@ -286,7 +286,7 @@ def _fit_model( def _learn( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, float, str]], + params: Dict[str, Union[float, str]], ) -> None: self.initialize() self._fit_model( diff --git a/annif/eval.py b/annif/eval.py index 64d61a3b8..8d0cd5c4c 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -288,7 +288,7 @@ def results( metrics: Union[Tuple[str, str], Tuple[()], List[str]] = [], results_file: Optional[Union[LazyFile, TextIOWrapper]] = None, language: Optional[str] = None, - ) -> Dict[str, Union[np.float64, float, int]]: + ) -> Dict[str, Union[np.float64, float]]: """evaluate a set of selected subjects against a gold standard using different metrics. If metrics is empty, use all available metrics. 
If results_file (file object) given, write results per subject to it diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 43fd1ffe8..8df5cfa9b 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -174,7 +174,7 @@ def _candidates_to_features(self, candidates: List[Candidate]) -> np.ndarray: @staticmethod def _get_label_props( - params: Dict[str, Union[int, float, bool, str]] + params: Dict[str, Union[float, bool, str]] ) -> Tuple[List[URIRef], List[URIRef]]: pref_label_props = [SKOS.prefLabel] @@ -189,7 +189,7 @@ def _prepare_terms( self, graph: Graph, vocab: AnnifVocabulary, - params: Dict[str, Union[int, float, bool, str]], + params: Dict[str, Union[float, bool, str]], ) -> Tuple[List[Term], List[int]]: pref_label_props, nonpref_label_props = self._get_label_props(params) @@ -220,7 +220,7 @@ def _prepare_train_index( self, vocab: AnnifVocabulary, analyzer: Analyzer, - params: Dict[str, Union[int, float, bool, str]], + params: Dict[str, Union[float, bool, str]], ) -> List[int]: graph = vocab.as_graph() terms, subject_ids = self._prepare_terms(graph, vocab, params) @@ -305,7 +305,7 @@ def prepare_train( corpus: DocumentCorpus, vocab: AnnifVocabulary, analyzer: Analyzer, - params: Dict[str, Union[int, float, bool, str]], + params: Dict[str, Union[float, bool, str]], n_jobs: int, ) -> Tuple[np.ndarray, np.ndarray]: # create an index from the vocabulary terms @@ -323,7 +323,7 @@ def prepare_train( return (np.vstack(features), np.array(train_y)) def _create_classifier( - self, params: Dict[str, Union[int, float, bool, str]] + self, params: Dict[str, Union[float, bool, str]] ) -> BaggingClassifier: return BaggingClassifier( DecisionTreeClassifier( @@ -337,7 +337,7 @@ def train( self, train_x: Union[np.ndarray, List[Tuple[int, int]]], train_y: Union[List[bool], np.ndarray], - params: Dict[str, Union[int, float, bool, str]], + params: Dict[str, Union[float, bool, str]], ) -> None: # fit the model on the training corpus self._classifier = self._create_classifier(params) diff --git a/annif/rest.py b/annif/rest.py index f6be1f7ae..7296727e5 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -127,7 +127,7 @@ def _is_error( def suggest( - project_id: str, body: Dict[str, Union[int, float, str]] + project_id: str, body: Dict[str, Union[float, str]] ) -> Union[ Dict[str, List[Any]], Dict[str, List[Dict[str, Optional[Union[str, float]]]]], @@ -174,7 +174,7 @@ def suggest_batch( def _suggest( project_id: str, documents: List[Union[Dict[str, str], Any]], - parameters: Dict[str, Union[int, float, str]], + parameters: Dict[str, Union[float, str]], ) -> Union[ List[Dict[str, List[Any]]], List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], diff --git a/annif/suggestion.py b/annif/suggestion.py index 7e830572b..14bf0a699 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -3,7 +3,7 @@ import collections import itertools -from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union +from typing import TYPE_CHECKING, Any, Iterator, List, Optional import numpy as np from scipy.sparse import csr_array @@ -25,7 +25,7 @@ def vector_to_suggestions(vector: np.ndarray, limit: int) -> Iterator[Any]: def filter_suggestion( preds: csr_array, limit: Optional[int] = None, - threshold: Union[int, float] = 0.0, + threshold: float = 0.0, ) -> csr_array: """filter a 2D sparse suggestion array (csr_array), retaining only the top K suggestions with a score above or equal to the threshold for each @@ -111,7 +111,7 @@ def from_sequence( @classmethod def from_averaged( - cls, 
batches: List[SuggestionBatch], weights: List[Union[int, float]] + cls, batches: List[SuggestionBatch], weights: List[float] ) -> SuggestionBatch: """Create a new SuggestionBatch where the subject scores are the weighted average of scores in several SuggestionBatches""" diff --git a/annif/transform/inputlimiter.py b/annif/transform/inputlimiter.py index 7729e7ae9..14a233350 100644 --- a/annif/transform/inputlimiter.py +++ b/annif/transform/inputlimiter.py @@ -15,9 +15,7 @@ class InputLimiter(transform.BaseTransform): name = "limit" - def __init__( - self, project: Optional[AnnifProject], input_limit: str - ) -> None: + def __init__(self, project: Optional[AnnifProject], input_limit: str) -> None: super().__init__(project) self.input_limit = int(input_limit) self._validate_value(self.input_limit) From 7c3c5dc5bc98b0a0f3995e99f9cfc57cdedac2e2 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Wed, 24 May 2023 17:01:51 +0300 Subject: [PATCH 11/28] Simplify hints using Sequence for Union[Tuple, List] --- annif/corpus/skos.py | 6 ++++-- annif/eval.py | 47 ++++---------------------------------------- 2 files changed, 8 insertions(+), 45 deletions(-) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 17d84e692..c64d591de 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -11,8 +11,8 @@ Dict, Iterator, List, + Sequence, Set, - Tuple, Union, ) @@ -118,7 +118,9 @@ def concepts(self) -> Iterator[URIRef]: yield concept def get_concept_labels( - self, concept: URIRef, label_types: Union[Tuple[URIRef, URIRef], List[URIRef]] + self, + concept: URIRef, + label_types: Sequence[URIRef], ) -> Union[DefaultDict[str, List[str]], DefaultDict[None, List[str]]]: """return all the labels of the given concept with the given label properties as a dict-like object where the keys are language codes diff --git a/annif/eval.py b/annif/eval.py index 8d0cd5c4c..a41e4d632 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union import numpy as np import scipy.sparse @@ -90,46 +90,7 @@ def evaluate_many( suggestion_batch: Union[ List[List[SubjectSuggestion]], SuggestionBatch, List[Iterator[Any]] ], - gold_subject_batch: Union[ - Tuple[SubjectSet, SubjectSet, SubjectSet], - Tuple[SubjectSet, SubjectSet, SubjectSet, SubjectSet], - Tuple[SubjectSet, SubjectSet], - Tuple[ - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - SubjectSet, - ], - List[SubjectSet], - ], + gold_subject_batch: Sequence[SubjectSet], ) -> None: if not isinstance(suggestion_batch, SuggestionBatch): suggestion_batch = SuggestionBatch.from_sequence( @@ -150,7 +111,7 @@ def _evaluate_samples( self, y_true: csr_array, y_pred: csr_array, - metrics: Union[Tuple[str, str], Tuple[()], List[str]] = [], + metrics: Sequence[str] = [], ) -> Dict[str, Union[np.float64, float, int]]: y_pred_binary = y_pred > 0.0 @@ -285,7 +246,7 @@ def output_result_per_subject( def results( self, - metrics: 
Union[Tuple[str, str], Tuple[()], List[str]] = [], + metrics: Sequence[str] = [], results_file: Optional[Union[LazyFile, TextIOWrapper]] = None, language: Optional[str] = None, ) -> Dict[str, Union[np.float64, float]]: From 961dd0943fe2b392b21a4b44c1371c64ee3f3d2b Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 25 May 2023 10:26:58 +0300 Subject: [PATCH 12/28] Remove too wide usage of Any (e.g. in Unions, Lists, Iterators) --- annif/backend/__init__.py | 4 ++-- annif/backend/dummy.py | 4 ++-- annif/backend/mllm.py | 10 +++++----- annif/backend/stwfsa.py | 8 +++----- annif/backend/tfidf.py | 4 ++-- annif/backend/yake.py | 6 ++---- annif/corpus/skos.py | 3 +-- annif/eval.py | 4 ++-- annif/lexical/mllm.py | 20 +++++++------------- annif/openapi/validation.py | 4 ++-- annif/rest.py | 20 ++++++++------------ annif/suggestion.py | 4 ++-- annif/transform/__init__.py | 4 ++-- annif/transform/transform.py | 8 ++++---- 14 files changed, 44 insertions(+), 59 deletions(-) diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index a0418f3b2..cbeeb648e 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -1,7 +1,7 @@ """Registry of backend types for Annif""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, Type +from typing import TYPE_CHECKING, Type if TYPE_CHECKING: from annif.backend.backend import AnnifBackend @@ -114,7 +114,7 @@ def _yake() -> Type[AnnifBackend]: } -def get_backend(backend_id: str) -> Any: +def get_backend(backend_id: str) -> Type[AnnifBackend]: if backend_id in _backend_fns: return _backend_fns[backend_id]() else: diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index fd6465ddb..5249846b9 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -1,7 +1,7 @@ """Dummy backend for testing basic interaction of projects and backends""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List, Union +from typing import TYPE_CHECKING, Dict, List, Union from annif.suggestion import SubjectSuggestion @@ -26,7 +26,7 @@ def initialize(self, parallel: bool = False) -> None: def _suggest( self, text: str, params: Dict[str, Union[int, str]] - ) -> List[Union[SubjectSuggestion, Any]]: + ) -> List[SubjectSuggestion]: score = float(params.get("score", 1.0)) # Ensure tests fail if "text" with wrong type ends up here diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index b8d530e64..a31537cf2 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Tuple, Union +from typing import TYPE_CHECKING, Dict, Iterator, List, Tuple, Union import joblib import numpy as np @@ -152,14 +152,14 @@ def _train( self.info("saving model") annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE) - def _generate_candidates(self, text: str) -> List[Union[Candidate, Any]]: + def _generate_candidates(self, text: str) -> List[Candidate]: return self._model.generate_candidates(text, self.project.analyzer) def _prediction_to_result( self, - prediction: List[Union[Tuple[np.float64, int], Any]], + prediction: List[Tuple[np.float64, int]], params: Dict[str, Union[float, bool, str]], - ) -> Iterator[Any]: + ) -> Iterator: vector = np.zeros(len(self.project.subjects), dtype=np.float32) for score, subject_id in prediction: vector[subject_id] = score @@ -167,7 +167,7 @@ def _prediction_to_result( def _suggest( self, 
text: str, params: Dict[str, Union[float, bool, str]] - ) -> Iterator[Any]: + ) -> Iterator: candidates = self._generate_candidates(text) prediction = self._model.predict(candidates) return self._prediction_to_result(prediction, params) diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index d8818ca16..6aed7eeb8 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Tuple, Union from stwfsapy.predictor import StwfsapyPredictor @@ -77,9 +77,7 @@ def initialize(self, parallel: bool = False) -> None: f"Model not found at {path}", backend_id=self.backend_id ) - def _load_data( - self, corpus: DocumentCorpus - ) -> Tuple[List[str], List[List[Union[str, Any]]]]: + def _load_data(self, corpus: DocumentCorpus) -> Tuple[List[str], List[List[str]]]: if corpus == "cached": raise NotSupportedException( "Training stwfsa project from cached data not supported." @@ -129,7 +127,7 @@ def _train( def _suggest( self, text: str, params: Dict[str, Union[str, bool, int]] - ) -> List[Union[SubjectSuggestion, Any]]: + ) -> List[SubjectSuggestion]: self.debug(f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') result = self._model.suggest_proba([text])[0] suggestions = [] diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index bb566287d..a7a399ce9 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -4,7 +4,7 @@ import os.path import tempfile -from typing import TYPE_CHECKING, Any, Dict, Iterator, Union +from typing import TYPE_CHECKING, Dict, Iterator, Union import gensim.similarities from gensim.matutils import Sparse2Corpus @@ -126,7 +126,7 @@ def _train( veccorpus = self.create_vectorizer(subjects) self._create_index(veccorpus) - def _suggest(self, text: str, params: Dict[str, int]) -> Iterator[Any]: + def _suggest(self, text: str, params: Dict[str, int]) -> Iterator: self.debug( 'Suggesting subjects for text "{}..." (len={})'.format(text[:20], len(text)) ) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 4989a4805..b36b6ec1c 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -6,7 +6,7 @@ import os.path import re from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple import joblib import yake @@ -117,9 +117,7 @@ def _sort_phrase(self, phrase: str) -> str: words = phrase.split() return " ".join(sorted(words)) - def _suggest( - self, text: str, params: Dict[str, Any] - ) -> List[Union[SubjectSuggestion, Any]]: + def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: self.debug(f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') limit = int(params["limit"]) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index c64d591de..6a5fb3f15 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -6,7 +6,6 @@ import shutil from typing import ( TYPE_CHECKING, - Any, DefaultDict, Dict, Iterator, @@ -28,7 +27,7 @@ from rdflib.term import URIRef -def serialize_subjects_to_skos(subjects: Iterator[Any], path: str) -> None: +def serialize_subjects_to_skos(subjects: Iterator, path: str) -> None: """Create a SKOS representation of the given subjects and serialize it into a SKOS/Turtle file with the given path name.""" import joblib diff --git a/annif/eval.py b/annif/eval.py index a41e4d632..57a8e163e 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Sequence, Union import numpy as np import scipy.sparse @@ -88,7 +88,7 @@ def __init__(self, subject_index: SubjectIndex) -> None: def evaluate_many( self, suggestion_batch: Union[ - List[List[SubjectSuggestion]], SuggestionBatch, List[Iterator[Any]] + List[List[SubjectSuggestion]], SuggestionBatch, List[Iterator] ], gold_subject_batch: Sequence[SubjectSet], ) -> None: diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 8df5cfa9b..3c7ef37da 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -5,7 +5,7 @@ import math from enum import IntEnum from statistics import mean -from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, DefaultDict, Dict, List, Tuple, Union import joblib import numpy as np @@ -55,9 +55,7 @@ ) -def conflate_matches( - matches: List[Match], doc_length: int -) -> List[Union[Candidate, Any]]: +def conflate_matches(matches: List[Match], doc_length: int) -> List[Candidate]: subj_matches = collections.defaultdict(list) for match in matches: subj_matches[match.subject_id].append(match) @@ -82,7 +80,7 @@ def generate_candidates( analyzer: Analyzer, vectorizer: CountVectorizer, index: TokenSetIndex, -) -> List[Union[Candidate, Any]]: +) -> List[Candidate]: sentences = analyzer.tokenize_sentences(text) sent_tokens = vectorizer.transform(sentences) matches = [] @@ -152,9 +150,7 @@ def candidates_to_features(cls, candidates): class MLLMModel: """Maui-like Lexical Matching model""" - def generate_candidates( - self, text: str, analyzer: Analyzer - ) -> List[Union[Candidate, Any]]: + def generate_candidates(self, text: str, analyzer: Analyzer) -> List[Candidate]: return generate_candidates(text, analyzer, self._vectorizer, self._index) @property @@ -246,7 +242,7 @@ def _prepare_train_index( def _prepare_train_data( self, corpus: DocumentCorpus, analyzer: Analyzer, n_jobs: int - ) -> Tuple[List[List[Union[Candidate, Any]]], List[bool]]: + ) -> Tuple[List[List[Candidate]], List[bool]]: # frequency of subjects (by id) in the generated candidates self._doc_freq = collections.Counter() # frequency of manually assigned subjects ("domain keyphraseness") @@ -286,7 +282,7 @@ def _calculate_idf( return idf def _prepare_features( - self, train_x: List[List[Union[Candidate, Any]]], n_jobs: int + self, train_x: List[List[Candidate]], n_jobs: int ) -> List[np.ndarray]: fc_args = {"mdata": self._model_data} jobs, pool_class = annif.parallel.get_pool(n_jobs) @@ -357,9 +353,7 @@ def _prediction_to_list( subj_scores = [(score[1], c.subject_id) for score, c in 
zip(scores, candidates)] return sorted(subj_scores, reverse=True) - def predict( - self, candidates: List[Union[Candidate, Any]] - ) -> List[Tuple[np.float64, int]]: + def predict(self, candidates: List[Candidate]) -> List[Tuple[np.float64, int]]: if not candidates: return [] features = self._candidates_to_features(candidates) diff --git a/annif/openapi/validation.py b/annif/openapi/validation.py index 0b2caec47..9a5d8c586 100644 --- a/annif/openapi/validation.py +++ b/annif/openapi/validation.py @@ -2,7 +2,7 @@ from __future__ import annotations import logging -from typing import Any, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union import jsonschema from connexion import decorators @@ -24,7 +24,7 @@ def validate_schema( data: Union[ List[Dict[str, Union[List[Dict[str, str]], str]]], List[Dict[str, Optional[List[bool]]]], - Dict[str, List[Any]], + Dict[str, List], Dict[str, str], Dict[str, List[Dict[str, str]]], ], diff --git a/annif/rest.py b/annif/rest.py index 7296727e5..d2f58caae 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -3,7 +3,7 @@ from __future__ import annotations import importlib -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import connexion @@ -100,8 +100,7 @@ def _hit_sets_to_list( hit_sets: SuggestionResults, subjects: SubjectIndex, lang: str ) -> List[ Union[ - Dict[str, List[Any]], - Any, + Dict[str, List], Dict[str, List[Dict[str, Union[str, float]]]], Dict[str, List[Dict[str, Optional[Union[str, float]]]]], ] @@ -114,7 +113,7 @@ def _hit_sets_to_list( def _is_error( result: Union[ - List[Dict[str, List[Any]]], + List[Dict[str, List]], List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], List[Dict[str, List[Dict[str, Union[str, float]]]]], ConnexionResponse, @@ -129,7 +128,7 @@ def _is_error( def suggest( project_id: str, body: Dict[str, Union[float, str]] ) -> Union[ - Dict[str, List[Any]], + Dict[str, List], Dict[str, List[Dict[str, Optional[Union[str, float]]]]], ConnexionResponse, Dict[str, List[Dict[str, Union[str, float]]]], @@ -150,7 +149,7 @@ def suggest( def suggest_batch( project_id: str, - body: Dict[str, Union[List[Any], List[Dict[str, str]]]], + body: Dict[str, Union[List, List[Dict[str, str]]]], **query_parameters, ) -> Union[ List[Dict[str, None]], @@ -173,10 +172,10 @@ def suggest_batch( def _suggest( project_id: str, - documents: List[Union[Dict[str, str], Any]], + documents: List[Dict[str, str]], parameters: Dict[str, Union[float, str]], ) -> Union[ - List[Dict[str, List[Any]]], + List[Dict[str, List]], List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], List[Dict[str, List[Dict[str, Union[str, float]]]]], ConnexionResponse, @@ -207,9 +206,7 @@ def _suggest( def _documents_to_corpus( - documents: List[ - Union[Dict[str, str], Dict[str, Union[List[Dict[str, str]], str]], Any] - ], + documents: List[Union[Dict[str, str], Dict[str, Union[List[Dict[str, str]], str]]]], subject_index: Optional[SubjectIndex], ) -> annif.corpus.document.DocumentList: if subject_index is not None: @@ -235,7 +232,6 @@ def learn( body: List[ Union[ Dict[str, Union[List[Dict[str, str]], str]], - Any, Dict[str, Optional[List[bool]]], ] ], diff --git a/annif/suggestion.py b/annif/suggestion.py index 14bf0a699..eb80d8888 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -3,7 +3,7 @@ import collections import itertools -from typing import TYPE_CHECKING, Any, Iterator, List, Optional +from typing import TYPE_CHECKING, 
Iterator, List, Optional import numpy as np from scipy.sparse import csr_array @@ -14,7 +14,7 @@ SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score") -def vector_to_suggestions(vector: np.ndarray, limit: int) -> Iterator[Any]: +def vector_to_suggestions(vector: np.ndarray, limit: int) -> Iterator: limit = min(len(vector), limit) topk_idx = np.argpartition(vector, -limit)[-limit:] return ( diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index d25af5b3b..46b30b920 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -19,10 +19,10 @@ def parse_specs( transform_specs: str, ) -> List[ Union[ - Tuple[str, List[Any], Dict[Any, Any]], + Tuple[str, List, Dict[Any, Any]], Tuple[str, List[str], Dict[str, str]], Tuple[str, List[str], Dict[Any, Any]], - Tuple[str, List[Any], Dict[str, str]], + Tuple[str, List, Dict[str, str]], ] ]: """Parse a transformation specification into a list of tuples, e.g. diff --git a/annif/transform/transform.py b/annif/transform/transform.py index caa4fc9c0..e19f0814d 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -45,9 +45,9 @@ def __init__( transform_classes: List[Type[BaseTransform]], args: List[ Union[ - Tuple[List[Any], Dict[str, str]], + Tuple[List, Dict[str, str]], Tuple[List[str], Dict[Any, Any]], - Tuple[List[Any], Dict[Any, Any]], + Tuple[List, Dict[Any, Any]], ] ], project: Optional[AnnifProject], @@ -60,9 +60,9 @@ def _init_transforms( transform_classes: List[Type[BaseTransform]], args: List[ Union[ - Tuple[List[Any], Dict[str, str]], + Tuple[List, Dict[str, str]], Tuple[List[str], Dict[Any, Any]], - Tuple[List[Any], Dict[Any, Any]], + Tuple[List, Dict[Any, Any]], ] ], ) -> List[Type[BaseTransform]]: From 33cdcf2a53c1d7bd0d74da2c59ca53369bc4aff1 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 25 May 2023 10:57:14 +0300 Subject: [PATCH 13/28] Unify type of params to Dict[str, Any] or DefaultDict[str, Dict --- annif/backend/backend.py | 16 ++++++++-------- annif/backend/dummy.py | 8 +++----- annif/backend/ensemble.py | 10 ++++------ annif/backend/fasttext.py | 10 +++++----- annif/backend/http.py | 6 ++---- annif/backend/mllm.py | 12 +++++------- annif/backend/nn_ensemble.py | 10 +++++----- annif/backend/omikuji.py | 10 +++++----- annif/backend/pav.py | 8 ++++---- annif/backend/stwfsa.py | 8 +++----- annif/backend/svc.py | 10 +++++----- annif/backend/tfidf.py | 6 +++--- annif/lexical/mllm.py | 18 +++++++----------- annif/project.py | 10 +++++----- 14 files changed, 64 insertions(+), 78 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index ea18479db..f69e1f55f 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -5,7 +5,7 @@ import os.path from datetime import datetime, timezone from glob import glob -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional from annif import logger from annif.suggestion import SuggestionBatch @@ -24,7 +24,7 @@ class AnnifBackend(metaclass=abc.ABCMeta): DEFAULT_PARAMETERS = {"limit": 100} def __init__( - self, backend_id: str, config_params: Any, project: AnnifProject + self, backend_id: str, config_params: Dict[str, Any], project: AnnifProject ) -> None: """Initialize backend with specific parameters. The parameters are a dict. 
Keys and values depend on the specific @@ -34,7 +34,7 @@ def __init__( self.project = project self.datadir = project.datadir - def default_params(self) -> Dict[str, Union[str, bool, int]]: + def default_params(self) -> Dict[str, Any]: return self.DEFAULT_PARAMETERS @property @@ -61,7 +61,7 @@ def modification_time(self) -> Optional[datetime.datetime]: def _get_backend_params( self, - params: Optional[Union[Dict[str, str], Dict[str, int], Dict[str, float]]], + params: Optional[Dict[str, Any]], ) -> Dict[str, Any]: backend_params = dict(self.params) if params is not None: @@ -71,7 +71,7 @@ def _get_backend_params( def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, str]], + params: Dict[str, Any], jobs: int = 0, ) -> None: """This method can be overridden by backends. It implements @@ -81,7 +81,7 @@ def _train( def train( self, corpus: DocumentCorpus, - params: Optional[Union[Dict[str, float], Dict[str, int]]] = None, + params: Optional[Dict[str, Any]] = None, jobs: int = 0, ) -> None: """Train the model on the given document or subject corpus.""" @@ -116,7 +116,7 @@ def _suggest_batch( def suggest( self, texts: List[str], - params: Optional[Union[Dict[str, str], Dict[str, int]]] = None, + params: Optional[Dict[str, Any]] = None, ) -> SuggestionBatch: """Suggest subjects for the input documents and return a list of subject sets represented as a list of SubjectSuggestion objects.""" @@ -149,7 +149,7 @@ def _learn(self, corpus, params): def learn( self, corpus: DocumentCorpus, - params: Optional[Dict[str, int]] = None, + params: Optional[Dict[str, Any]] = None, ) -> None: """Further train the model on the given document or subject corpus.""" beparams = self._get_backend_params(params) diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index 5249846b9..b7a0fd357 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -1,7 +1,7 @@ """Dummy backend for testing basic interaction of projects and backends""" from __future__ import annotations -from typing import TYPE_CHECKING, Dict, List, Union +from typing import TYPE_CHECKING, Any, Dict, List from annif.suggestion import SubjectSuggestion @@ -24,9 +24,7 @@ def default_params(self) -> Dict[str, int]: def initialize(self, parallel: bool = False) -> None: self.initialized = True - def _suggest( - self, text: str, params: Dict[str, Union[int, str]] - ) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: score = float(params.get("score", 1.0)) # Ensure tests fail if "text" with wrong type ends up here @@ -47,7 +45,7 @@ def _suggest( def _learn( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, str]], + params: Dict[str, Any], ) -> None: # in this dummy backend we "learn" by picking up the subject ID # of the first subject of the first document in the learning set diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index 08dac5b2a..a8a93d833 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -1,7 +1,7 @@ """Ensemble backend that combines results from multiple projects""" from __future__ import annotations -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import annif.eval import annif.parallel @@ -49,7 +49,7 @@ def _merge_source_batches( self, batch_by_source: Dict[str, SuggestionBatch], sources: List[Tuple[str, float]], - params: Dict[str, Union[int, str]], + params: Dict[str, Any], ) -> SuggestionBatch: """Merge the 
given SuggestionBatches from each source into a single SuggestionBatch. The default implementation computes a weighted @@ -63,7 +63,7 @@ def _merge_source_batches( ) def _suggest_batch( - self, texts: List[str], params: Dict[str, Union[float, str]] + self, texts: List[str], params: Dict[str, Any] ) -> SuggestionBatch: sources = annif.util.parse_sources(params["sources"]) batch_by_source = self._suggest_with_sources(texts, sources) @@ -159,7 +159,5 @@ def get_hp_optimizer( ) -> EnsembleOptimizer: return EnsembleOptimizer(self, corpus, metric) - def _train( - self, corpus: DocumentCorpus, params: Dict[str, Union[int, str]], jobs: int = 0 - ): + def _train(self, corpus: DocumentCorpus, params: Dict[str, Any], jobs: int = 0): raise NotSupportedException("Training ensemble backend is not possible.") diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index 740162bc8..fd5ed770b 100644 --- a/annif/backend/fasttext.py +++ b/annif/backend/fasttext.py @@ -3,7 +3,7 @@ import collections import os.path -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple import fasttext @@ -56,7 +56,7 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend): # defaults for uninitialized instances _model = None - def default_params(self) -> Dict[str, Union[float, str]]: + def default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(mixins.ChunkingBackend.DEFAULT_PARAMETERS) params.update(self.DEFAULT_PARAMETERS) @@ -119,7 +119,7 @@ def _create_train_file( corpus, self.datadir, self.TRAIN_FILE, method=self._write_train_file ) - def _create_model(self, params: Dict[str, Union[float, str]], jobs: int) -> None: + def _create_model(self, params: Dict[str, Any], jobs: int) -> None: self.info("creating fastText model") trainpath = os.path.join(self.datadir, self.TRAIN_FILE) modelpath = os.path.join(self.datadir, self.MODEL_FILE) @@ -137,7 +137,7 @@ def _create_model(self, params: Dict[str, Union[float, str]], jobs: int) -> None def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[float, str]], + params: Dict[str, Any], jobs: int = 0, ) -> None: if corpus != "cached": @@ -163,7 +163,7 @@ def _predict_chunks( ) def _suggest_chunks( - self, chunktexts: List[str], params: Dict[str, Union[float, str]] + self, chunktexts: List[str], params: Dict[str, Any] ) -> List[SubjectSuggestion]: limit = int(params["limit"]) chunklabels, chunkscores = self._predict_chunks(chunktexts, limit) diff --git a/annif/backend/http.py b/annif/backend/http.py index 59a729b89..85298bbee 100644 --- a/annif/backend/http.py +++ b/annif/backend/http.py @@ -3,7 +3,7 @@ from __future__ import annotations import importlib -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import dateutil.parser import requests @@ -63,9 +63,7 @@ def _get_project_info(self, key: str) -> Optional[Union[bool, str]]: else: return None - def _suggest( - self, text: str, params: Dict[str, Union[int, str]] - ) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: data = {"text": text} if "project" in params: data["project"] = params["project"] diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index a31537cf2..138f98282 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, 
Dict, Iterator, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Tuple import joblib import numpy as np @@ -93,7 +93,7 @@ class MLLMBackend(hyperopt.AnnifHyperoptBackend): def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> MLLMOptimizer: return MLLMOptimizer(self, corpus, metric) - def default_params(self) -> Dict[str, Union[float, bool]]: + def default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -124,7 +124,7 @@ def initialize(self, parallel: bool = False) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[float, bool, str]], + params: Dict[str, Any], jobs: int = 0, ) -> None: self.info("starting train") @@ -158,16 +158,14 @@ def _generate_candidates(self, text: str) -> List[Candidate]: def _prediction_to_result( self, prediction: List[Tuple[np.float64, int]], - params: Dict[str, Union[float, bool, str]], + params: Dict[str, Any], ) -> Iterator: vector = np.zeros(len(self.project.subjects), dtype=np.float32) for score, subject_id in prediction: vector[subject_id] = score return vector_to_suggestions(vector, int(params["limit"])) - def _suggest( - self, text: str, params: Dict[str, Union[float, bool, str]] - ) -> Iterator: + def _suggest(self, text: str, params: Dict[str, Any]) -> Iterator: candidates = self._generate_candidates(text) prediction = self._model.predict(candidates) return self._prediction_to_result(prediction, params) diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index 29f2add94..5fb82fb28 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -5,7 +5,7 @@ import os.path import shutil from io import BytesIO -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union import joblib import lmdb @@ -112,7 +112,7 @@ class NNEnsembleBackend(backend.AnnifLearningBackend, ensemble.BaseEnsembleBacke # defaults for uninitialized instances _model = None - def default_params(self) -> Dict[str, Union[float, str]]: + def default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -140,7 +140,7 @@ def _merge_source_batches( self, batch_by_source: Dict[str, SuggestionBatch], sources: List[Tuple[str, float]], - params: Dict[str, Union[float, str]], + params: Dict[str, Any], ) -> SuggestionBatch: src_weight = dict(sources) score_vectors = np.array( @@ -199,7 +199,7 @@ def _create_model(self, sources: List[Tuple[str, float]]) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[float, str]], + params: Dict[str, Any], jobs: int = 0, ) -> None: sources = annif.util.parse_sources(self.params["sources"]) @@ -286,7 +286,7 @@ def _fit_model( def _learn( self, corpus: DocumentCorpus, - params: Dict[str, Union[float, str]], + params: Dict[str, Any], ) -> None: self.initialize() self._fit_model( diff --git a/annif/backend/omikuji.py b/annif/backend/omikuji.py index e60467d85..7c47c1b8a 100644 --- a/annif/backend/omikuji.py +++ b/annif/backend/omikuji.py @@ -3,7 +3,7 @@ import os.path import shutil -from typing import TYPE_CHECKING, Dict, List, Union +from typing import TYPE_CHECKING, Any, Dict, List import omikuji @@ -43,7 +43,7 @@ class OmikujiBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): "collapse_every_n_layers": 0, } - def default_params(self) -> Dict[str, Union[int, bool]]: + def 
default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -96,7 +96,7 @@ def _create_train_file(self, veccorpus: csr_matrix, corpus: DocumentCorpus) -> N trainfile.seek(0) print("{:08d}".format(n_samples), end="", file=trainfile) - def _create_model(self, params: Dict[str, Union[int, bool]], jobs: int) -> None: + def _create_model(self, params: Dict[str, Any], jobs: int) -> None: train_path = os.path.join(self.datadir, self.TRAIN_FILE) model_path = os.path.join(self.datadir, self.MODEL_FILE) hyper_param = omikuji.Model.default_hyper_param() @@ -114,7 +114,7 @@ def _create_model(self, params: Dict[str, Union[int, bool]], jobs: int) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, bool]], + params: Dict[str, Any], jobs: int = 0, ) -> None: if corpus != "cached": @@ -135,7 +135,7 @@ def _train( self._create_model(params, jobs) def _suggest_batch( - self, texts: List[str], params: Dict[str, Union[int, bool]] + self, texts: List[str], params: Dict[str, Any] ) -> SuggestionBatch: vector = self.vectorizer.transform(texts) limit = int(params["limit"]) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index dba85f62e..fc0571274 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -5,7 +5,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple import joblib import numpy as np @@ -36,7 +36,7 @@ class PAVBackend(ensemble.BaseEnsembleBackend): DEFAULT_PARAMETERS = {"min-docs": 10} - def default_params(self) -> Dict[str, int]: + def default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -67,7 +67,7 @@ def _merge_source_batches( self, batch_by_source: Dict[str, SuggestionBatch], sources: List[Tuple[str, float]], - params: Dict[str, Union[int, str]], + params: Dict[str, Any], ) -> SuggestionBatch: reg_batch_by_source = {} for project_id, batch in batch_by_source.items(): @@ -156,7 +156,7 @@ def _create_pav_model( def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[int, str]], + params: Dict[str, Any], jobs: int = 0, ) -> None: if corpus == "cached": diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index 6aed7eeb8..69e93d770 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple from stwfsapy.predictor import StwfsapyPredictor @@ -102,7 +102,7 @@ def _load_data(self, corpus: DocumentCorpus) -> Tuple[List[str], List[List[str]] def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[str, bool, int]], + params: Dict[str, Any], jobs: int = 0, ) -> None: X, y = self._load_data(corpus) @@ -125,9 +125,7 @@ def _train( lambda model, store_path: model.store(store_path), ) - def _suggest( - self, text: str, params: Dict[str, Union[str, bool, int]] - ) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: self.debug(f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') result = self._model.suggest_proba([text])[0] suggestions = [] diff --git a/annif/backend/svc.py b/annif/backend/svc.py index 34f989a6e..257afeca8 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Dict, List, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Tuple import joblib import numpy as np @@ -33,7 +33,7 @@ class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): DEFAULT_PARAMETERS = {"min_df": 1, "ngram": 1} - def default_params(self) -> Dict[str, int]: + def default_params(self) -> Dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -79,7 +79,7 @@ def _train_classifier(self, veccorpus: csr_matrix, classes: List[int]) -> None: ) def _train( - self, corpus: DocumentCorpus, params: Dict[str, int], jobs: int = 0 + self, corpus: DocumentCorpus, params: Dict[str, Any], jobs: int = 0 ) -> None: if corpus == "cached": raise NotSupportedException( @@ -97,7 +97,7 @@ def _train( self._train_classifier(veccorpus, classes) def _scores_to_suggestions( - self, scores: np.ndarray, params: Dict[str, int] + self, scores: np.ndarray, params: Dict[str, Any] ) -> List[SubjectSuggestion]: results = [] limit = int(params["limit"]) @@ -110,7 +110,7 @@ def _scores_to_suggestions( return results def _suggest_batch( - self, texts: List[str], params: Dict[str, int] + self, texts: List[str], params: Dict[str, Any] ) -> SuggestionBatch: vector = self.vectorizer.transform(texts) confidences = self._model.decision_function(vector) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index a7a399ce9..89ddbdcf2 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -4,7 +4,7 @@ import os.path import tempfile -from typing import TYPE_CHECKING, Dict, Iterator, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator import gensim.similarities from gensim.matutils import Sparse2Corpus @@ -112,7 +112,7 @@ def _create_index(self, veccorpus: csr_matrix) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Union[str, int]], + params: Dict[str, Any], jobs: int = 0, ) -> None: if corpus == "cached": @@ -126,7 +126,7 @@ def _train( veccorpus = self.create_vectorizer(subjects) self._create_index(veccorpus) - def _suggest(self, text: str, params: Dict[str, int]) -> Iterator: + def _suggest(self, text: str, params: Dict[str, Any]) -> Iterator: self.debug( 'Suggesting subjects for text "{}..." 
(len={})'.format(text[:20], len(text)) ) diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 3c7ef37da..36ba662de 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -5,7 +5,7 @@ import math from enum import IntEnum from statistics import mean -from typing import TYPE_CHECKING, DefaultDict, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Tuple, Union import joblib import numpy as np @@ -169,9 +169,7 @@ def _candidates_to_features(self, candidates: List[Candidate]) -> np.ndarray: return candidates_to_features(candidates, self._model_data) @staticmethod - def _get_label_props( - params: Dict[str, Union[float, bool, str]] - ) -> Tuple[List[URIRef], List[URIRef]]: + def _get_label_props(params: Dict[str, Any]) -> Tuple[List[URIRef], List[URIRef]]: pref_label_props = [SKOS.prefLabel] if annif.util.boolean(params["use_hidden_labels"]): @@ -185,7 +183,7 @@ def _prepare_terms( self, graph: Graph, vocab: AnnifVocabulary, - params: Dict[str, Union[float, bool, str]], + params: Dict[str, Any], ) -> Tuple[List[Term], List[int]]: pref_label_props, nonpref_label_props = self._get_label_props(params) @@ -216,7 +214,7 @@ def _prepare_train_index( self, vocab: AnnifVocabulary, analyzer: Analyzer, - params: Dict[str, Union[float, bool, str]], + params: Dict[str, Any], ) -> List[int]: graph = vocab.as_graph() terms, subject_ids = self._prepare_terms(graph, vocab, params) @@ -301,7 +299,7 @@ def prepare_train( corpus: DocumentCorpus, vocab: AnnifVocabulary, analyzer: Analyzer, - params: Dict[str, Union[float, bool, str]], + params: Dict[str, Any], n_jobs: int, ) -> Tuple[np.ndarray, np.ndarray]: # create an index from the vocabulary terms @@ -318,9 +316,7 @@ def prepare_train( return (np.vstack(features), np.array(train_y)) - def _create_classifier( - self, params: Dict[str, Union[float, bool, str]] - ) -> BaggingClassifier: + def _create_classifier(self, params: Dict[str, Any]) -> BaggingClassifier: return BaggingClassifier( DecisionTreeClassifier( min_samples_leaf=int(params["min_samples_leaf"]), @@ -333,7 +329,7 @@ def train( self, train_x: Union[np.ndarray, List[Tuple[int, int]]], train_y: Union[List[bool], np.ndarray], - params: Dict[str, Union[float, bool, str]], + params: Dict[str, Any], ) -> None: # fit the model on the training corpus self._classifier = self._create_classifier(params) diff --git a/annif/project.py b/annif/project.py index 722e46c9b..99a3096d0 100644 --- a/annif/project.py +++ b/annif/project.py @@ -134,7 +134,7 @@ def initialize(self, parallel: bool = False) -> None: def _suggest_with_backend( self, texts: List[str], - backend_params: Optional[DefaultDict[str, Dict[str, str]]], + backend_params: Optional[DefaultDict[str, Dict]], ) -> annif.suggestion.SuggestionBatch: if backend_params is None: backend_params = {} @@ -226,7 +226,7 @@ def modification_time(self) -> Optional[datetime]: def suggest_corpus( self, corpus: DocumentCorpus, - backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, + backend_params: Optional[DefaultDict[str, Dict]] = None, ) -> annif.suggestion.SuggestionResults: """Suggest subjects for the given documents corpus in batches of documents.""" suggestions = ( @@ -239,7 +239,7 @@ def suggest_corpus( def suggest( self, texts: List[str], - backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, + backend_params: Optional[DefaultDict[str, Dict]] = None, ) -> annif.suggestion.SuggestionBatch: """Suggest subjects for the given documents batch.""" if not self.is_trained: @@ -253,7 
+253,7 @@ def suggest( def train( self, corpus: DocumentCorpus, - backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, + backend_params: Optional[DefaultDict[str, Dict]] = None, jobs: int = 0, ) -> None: """train the project using documents from a metadata source""" @@ -267,7 +267,7 @@ def train( def learn( self, corpus: DocumentCorpus, - backend_params: Optional[DefaultDict[str, Dict[str, str]]] = None, + backend_params: Optional[DefaultDict[str, Dict]] = None, ) -> None: """further train the project using documents from a metadata source""" if backend_params is None: From bb9951fbaa7a024841e6865f2d6b5f27bc028619 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 25 May 2023 12:11:24 +0300 Subject: [PATCH 14/28] Simplify overly complex types --- annif/cli_util.py | 6 ++--- annif/corpus/subject.py | 2 +- annif/lexical/tokenset.py | 4 +-- annif/openapi/validation.py | 10 ++------ annif/project.py | 2 +- annif/rest.py | 48 ++++++++---------------------------- annif/transform/__init__.py | 11 ++------- annif/transform/transform.py | 18 +++----------- 8 files changed, 23 insertions(+), 78 deletions(-) diff --git a/annif/cli_util.py b/annif/cli_util.py index be1c07690..6dad84790 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -128,7 +128,7 @@ def format_datetime(dt: Optional[datetime]) -> str: def open_documents( - paths: Union[Tuple[str], Tuple[str, str], Tuple[()]], + paths: Union[Tuple[str, ...], Tuple[()]], subject_index: SubjectIndex, vocab_lang: str, docs_limit: Optional[int], @@ -161,7 +161,7 @@ def open_doc_path(path, subject_index): def open_text_documents( - paths: Union[Tuple[str], Tuple[str, str]], docs_limit: Optional[int] + paths: Tuple[str, ...], docs_limit: Optional[int] ) -> DocumentList: """ Helper function to read text documents from the given file paths. 
Returns a @@ -205,7 +205,7 @@ def show_hits( def parse_backend_params( - backend_param: Union[Tuple[str], Tuple[()]], project: AnnifProject + backend_param: Union[Tuple[str, ...], Tuple[()]], project: AnnifProject ) -> DefaultDict[str, Dict[str, str]]: """Parse a list of backend parameters given with the --backend-param option into a nested dict structure""" diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 49cb88f7d..0ef5d2b73 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -260,7 +260,7 @@ def from_string( @staticmethod def _parse_line( line: str, - ) -> Union[Tuple[None, None], Tuple[str, str], Tuple[None, str]]: + ) -> Tuple[Optional[str], Optional[str]]: uri = label = None vals = line.split("\t") for val in vals: diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py index 42a11a4d4..570ea1204 100644 --- a/annif/lexical/tokenset.py +++ b/annif/lexical/tokenset.py @@ -51,9 +51,7 @@ def add(self, tset: TokenSet) -> None: if tset.key is not None: self._index[tset.key].add(tset) - def _find_subj_tsets( - self, tset: TokenSet - ) -> Union[Dict[Optional[int], TokenSet], Dict[int, TokenSet]]: + def _find_subj_tsets(self, tset: TokenSet) -> Dict[Optional[int], TokenSet]: """return a dict (subject_id : TokenSet) of matches contained in the given TokenSet""" diff --git a/annif/openapi/validation.py b/annif/openapi/validation.py index 9a5d8c586..432c7fa59 100644 --- a/annif/openapi/validation.py +++ b/annif/openapi/validation.py @@ -2,7 +2,7 @@ from __future__ import annotations import logging -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union import jsonschema from connexion import decorators @@ -21,13 +21,7 @@ def __init__(self, *args, **kwargs) -> None: def validate_schema( self, - data: Union[ - List[Dict[str, Union[List[Dict[str, str]], str]]], - List[Dict[str, Optional[List[bool]]]], - Dict[str, List], - Dict[str, str], - Dict[str, List[Dict[str, str]]], - ], + data: Union[List, Dict], url: str, ) -> None: """Validate the request body against the schema.""" diff --git a/annif/project.py b/annif/project.py index 99a3096d0..408c1af30 100644 --- a/annif/project.py +++ b/annif/project.py @@ -300,7 +300,7 @@ def hyperopt( project_id=self.project_id, ) - def dump(self) -> Dict[str, Optional[Union[str, Dict[str, str], bool, datetime]]]: + def dump(self) -> Dict[str, Optional[Union[str, Dict, bool, datetime]]]: """return this project as a dict""" return { "project_id": self.project_id, diff --git a/annif/rest.py b/annif/rest.py index d2f58caae..f7af6dc67 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -3,7 +3,7 @@ from __future__ import annotations import importlib -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import connexion @@ -60,7 +60,7 @@ def language_not_supported_error(lang: str) -> ConnexionResponse: def list_projects() -> ( - Dict[str, List[Dict[str, Optional[Union[str, Dict[str, str], bool, datetime]]]]] + Dict[str, List[Dict[str, Optional[Union[str, Dict, bool, datetime]]]]] ): """return a dict with projects formatted according to OpenAPI spec""" @@ -74,7 +74,7 @@ def list_projects() -> ( def show_project( project_id: str, -) -> Union[Dict[str, Optional[Union[str, Dict[str, str], bool]]], ConnexionResponse]: +) -> Union[Dict, ConnexionResponse]: """return a single project formatted according to OpenAPI spec""" try: @@ -98,13 +98,7 @@ def _suggestion_to_dict( def _hit_sets_to_list( hit_sets: 
SuggestionResults, subjects: SubjectIndex, lang: str -) -> List[ - Union[ - Dict[str, List], - Dict[str, List[Dict[str, Union[str, float]]]], - Dict[str, List[Dict[str, Optional[Union[str, float]]]]], - ] -]: +) -> List[Dict[str, List]]: return [ {"results": [_suggestion_to_dict(hit, subjects, lang) for hit in hits]} for hits in hit_sets @@ -114,8 +108,6 @@ def _hit_sets_to_list( def _is_error( result: Union[ List[Dict[str, List]], - List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], - List[Dict[str, List[Dict[str, Union[str, float]]]]], ConnexionResponse, ] ) -> bool: @@ -127,12 +119,7 @@ def _is_error( def suggest( project_id: str, body: Dict[str, Union[float, str]] -) -> Union[ - Dict[str, List], - Dict[str, List[Dict[str, Optional[Union[str, float]]]]], - ConnexionResponse, - Dict[str, List[Dict[str, Union[str, float]]]], -]: +) -> Union[Dict[str, List], ConnexionResponse]: """suggest subjects for the given text and return a dict with results formatted according to OpenAPI spec""" @@ -149,14 +136,9 @@ def suggest( def suggest_batch( project_id: str, - body: Dict[str, Union[List, List[Dict[str, str]]]], + body: Dict[str, List], **query_parameters, -) -> Union[ - List[Dict[str, None]], - List[Dict[str, Optional[List[Dict[str, Optional[Union[str, float]]]]]]], - List[Dict[str, Union[List[Dict[str, Optional[Union[str, float]]]], str]]], - ConnexionResponse, -]: +) -> Union[List[Dict[str, Any]], ConnexionResponse]: """suggest subjects for the given documents and return a list of dicts with results formatted according to OpenAPI spec""" @@ -174,12 +156,7 @@ def _suggest( project_id: str, documents: List[Dict[str, str]], parameters: Dict[str, Union[float, str]], -) -> Union[ - List[Dict[str, List]], - List[Dict[str, List[Dict[str, Optional[Union[str, float]]]]]], - List[Dict[str, List[Dict[str, Union[str, float]]]]], - ConnexionResponse, -]: +) -> Union[List[Dict[str, List]], ConnexionResponse]: corpus = _documents_to_corpus(documents, subject_index=None) try: project = annif.registry.get_project(project_id, min_access=Access.hidden) @@ -206,7 +183,7 @@ def _suggest( def _documents_to_corpus( - documents: List[Union[Dict[str, str], Dict[str, Union[List[Dict[str, str]], str]]]], + documents: List[Dict[str, Any]], subject_index: Optional[SubjectIndex], ) -> annif.corpus.document.DocumentList: if subject_index is not None: @@ -229,12 +206,7 @@ def _documents_to_corpus( def learn( project_id: str, - body: List[ - Union[ - Dict[str, Union[List[Dict[str, str]], str]], - Dict[str, Optional[List[bool]]], - ] - ], + body: List[Dict[str, Any]], ) -> Union[ConnexionResponse, Tuple[None, int]]: """learn from documents and return an empty 204 response if succesful""" diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index 46b30b920..e88dbe75e 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -2,7 +2,7 @@ from __future__ import annotations import re -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple import annif from annif.exception import ConfigurationException @@ -17,14 +17,7 @@ def parse_specs( transform_specs: str, -) -> List[ - Union[ - Tuple[str, List, Dict[Any, Any]], - Tuple[str, List[str], Dict[str, str]], - Tuple[str, List[str], Dict[Any, Any]], - Tuple[str, List, Dict[str, str]], - ] -]: +) -> List[Tuple[str, List, Dict]]: """Parse a transformation specification into a list of tuples, e.g. 
'transf_1(x),transf_2(y=42),transf_3' is parsed to [(transf_1, [x], {}), (transf_2, [], {y: 42}), (transf_3, [], {})].""" diff --git a/annif/transform/transform.py b/annif/transform/transform.py index e19f0814d..a4ba227d7 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -2,7 +2,7 @@ from __future__ import annotations import abc -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type from annif.corpus import TransformingDocumentCorpus from annif.exception import ConfigurationException @@ -43,13 +43,7 @@ class TransformChain: def __init__( self, transform_classes: List[Type[BaseTransform]], - args: List[ - Union[ - Tuple[List, Dict[str, str]], - Tuple[List[str], Dict[Any, Any]], - Tuple[List, Dict[Any, Any]], - ] - ], + args: List[Tuple[List, Dict]], project: Optional[AnnifProject], ) -> None: self.project = project @@ -58,13 +52,7 @@ def __init__( def _init_transforms( self, transform_classes: List[Type[BaseTransform]], - args: List[ - Union[ - Tuple[List, Dict[str, str]], - Tuple[List[str], Dict[Any, Any]], - Tuple[List, Dict[Any, Any]], - ] - ], + args: List[Tuple[List, Dict]], ) -> List[Type[BaseTransform]]: transforms = [] for trans, (posargs, kwargs) in zip(transform_classes, args): From c405d8346e5b02e22b02ba084d94f6c459c4437e Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 25 May 2023 14:25:29 +0300 Subject: [PATCH 15/28] Fix erronously passing whole Error obj to ClickException instead of just msg --- annif/config.py | 2 +- annif/exception.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/annif/config.py b/annif/config.py index cad59e734..2a4f0ac9a 100644 --- a/annif/config.py +++ b/annif/config.py @@ -29,7 +29,7 @@ def __init__(self, filename: str) -> None: configparser.DuplicateOptionError, configparser.DuplicateSectionError, ) as err: - raise ConfigurationException(err) + raise ConfigurationException(err.message) @property def project_ids(self) -> List[str]: diff --git a/annif/exception.py b/annif/exception.py index 64d1daaf6..9f497324d 100644 --- a/annif/exception.py +++ b/annif/exception.py @@ -1,13 +1,10 @@ """Custom exceptions used by Annif""" from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional from click import ClickException -if TYPE_CHECKING: - from configparser import DuplicateSectionError - class AnnifException(ClickException): """Base Annif exception. 
We define this as a subclass of ClickException so @@ -16,7 +13,7 @@ class AnnifException(ClickException): def __init__( self, - message: Union[DuplicateSectionError, str], + message: str, project_id: Optional[str] = None, backend_id: Optional[str] = None, ) -> None: From b74d869c7034303e829548144a4b22844fdb376a Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 25 May 2023 15:50:54 +0300 Subject: [PATCH 16/28] Annotate (manually) annif/backend/hyperopt.py --- annif/backend/hyperopt.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/annif/backend/hyperopt.py b/annif/backend/hyperopt.py index 1bdce0aa4..d05841e2f 100644 --- a/annif/backend/hyperopt.py +++ b/annif/backend/hyperopt.py @@ -1,14 +1,23 @@ """Hyperparameter optimization functionality for backends""" +from __future__ import annotations import abc import collections import warnings +from typing import TYPE_CHECKING, Callable, Dict, Optional import optuna import optuna.exceptions from .backend import AnnifBackend +if TYPE_CHECKING: + from click.utils import LazyFile + from optuna.study.study import Study + from optuna.trial import Trial + + from annif.corpus.document import DocumentCorpus + HPRecommendation = collections.namedtuple("HPRecommendation", "lines score") @@ -16,12 +25,12 @@ class TrialWriter: """Object that writes hyperparameter optimization trial results into a TSV file.""" - def __init__(self, results_file, normalize_func): + def __init__(self, results_file: LazyFile, normalize_func: Callable) -> None: self.results_file = results_file self.normalize_func = normalize_func self.header_written = False - def write(self, study, trial): + def write(self, study: Study, trial: Trial) -> None: """Write the results of one trial into the results file. On the first run, write the header line first.""" @@ -44,12 +53,14 @@ def write(self, study, trial): class HyperparameterOptimizer: """Base class for hyperparameter optimizers""" - def __init__(self, backend, corpus, metric): + def __init__( + self, backend: AnnifBackend, corpus: DocumentCorpus, metric: str + ) -> None: self._backend = backend self._corpus = corpus self._metric = metric - def _prepare(self, n_jobs=1): + def _prepare(self, n_jobs: int = 1): """Prepare the optimizer for hyperparameter evaluation. Up to n_jobs parallel threads or processes may be used during the operation.""" @@ -57,21 +68,23 @@ def _prepare(self, n_jobs=1): pass # pragma: no cover @abc.abstractmethod - def _objective(self, trial): + def _objective(self, trial: Trial) -> float: """Objective function to optimize""" pass # pragma: no cover @abc.abstractmethod - def _postprocess(self, study): + def _postprocess(self, study: Study) -> HPRecommendation: """Convert the study results into hyperparameter recommendations""" pass # pragma: no cover - def _normalize(self, hps): + def _normalize(self, hps: Dict[str, float]) -> Dict[str, float]: """Normalize the given raw hyperparameters. Intended to be overridden by subclasses when necessary. 
The default is to keep them as-is.""" return hps - def optimize(self, n_trials, n_jobs, results_file): + def optimize( + self, n_trials: int, n_jobs: int, results_file: Optional[LazyFile] + ) -> HPRecommendation: """Find the optimal hyperparameters by testing up to the given number of hyperparameter combinations""" @@ -103,7 +116,7 @@ class AnnifHyperoptBackend(AnnifBackend): optimization""" @abc.abstractmethod - def get_hp_optimizer(self, corpus, metric): + def get_hp_optimizer(self, corpus: DocumentCorpus): """Get a HyperparameterOptimizer object that can look for optimal hyperparameter combinations for the given corpus, measured using the given metric""" From 4eb904e884bcfd829d5d08a7c92fa2f825abe308 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 26 May 2023 10:08:58 +0300 Subject: [PATCH 17/28] Manually annotate annif/backend/mixins.py --- annif/backend/mixins.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py index 5161a947d..525dc18e4 100644 --- a/annif/backend/mixins.py +++ b/annif/backend/mixins.py @@ -1,8 +1,9 @@ """Annif backend mixins that can be used to implement features""" - +from __future__ import annotations import abc import os.path +from typing import TYPE_CHECKING, Any, Dict, Iterator, List import joblib from sklearn.feature_extraction.text import TfidfVectorizer @@ -10,23 +11,30 @@ import annif.util from annif.exception import NotInitializedException +if TYPE_CHECKING: + from scipy.sparse._csr import csr_matrix + + from annif.suggestion import SubjectSuggestion + class ChunkingBackend(metaclass=abc.ABCMeta): """Annif backend mixin that implements chunking of input""" DEFAULT_PARAMETERS = {"chunksize": 1} - def default_params(self): + def default_params(self) -> Dict[str, Any]: return self.DEFAULT_PARAMETERS @abc.abstractmethod - def _suggest_chunks(self, chunktexts, params): + def _suggest_chunks( + self, chunktexts: List[str], params: Dict[str, Any] + ) -> List[SubjectSuggestion]: """Suggest subjects for the chunked text; should be implemented by the subclass inheriting this mixin""" pass # pragma: no cover - def _suggest(self, text, params): + def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: self.debug( 'Suggesting subjects for text "{}..." 
(len={})'.format(text[:20], len(text)) ) @@ -49,7 +57,7 @@ class TfidfVectorizerMixin: vectorizer = None - def initialize_vectorizer(self): + def initialize_vectorizer(self) -> None: if self.vectorizer is None: path = os.path.join(self.datadir, self.VECTORIZER_FILE) if os.path.exists(path): @@ -61,7 +69,9 @@ def initialize_vectorizer(self): backend_id=self.backend_id, ) - def create_vectorizer(self, input, params={}): + def create_vectorizer( + self, input: Iterator[str], params: Dict[str, Any] = {} + ) -> csr_matrix: self.info("creating vectorizer") self.vectorizer = TfidfVectorizer(**params) veccorpus = self.vectorizer.fit_transform(input) From 6987c05f16687c89bf2b5883ebbe60b4f54085dc Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 26 May 2023 10:53:04 +0300 Subject: [PATCH 18/28] Manually annotate annif/corpus/document.py --- annif/corpus/document.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 54a0a3ba6..87937ebdb 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -1,29 +1,40 @@ """Clases for supporting document corpora""" +from __future__ import annotations import glob import gzip import os.path import re from itertools import islice +from typing import TYPE_CHECKING, Iterator, Optional, Tuple, Union import annif.util from .subject import SubjectSet from .types import Document, DocumentCorpus +if TYPE_CHECKING: + from annif.corpus.subject import SubjectIndex + logger = annif.logger class DocumentDirectory(DocumentCorpus): """A directory of files as a full text document corpus""" - def __init__(self, path, subject_index=None, language=None, require_subjects=False): + def __init__( + self, + path: str, + subject_index: Optional[SubjectIndex] = None, + language: Optional[str] = None, + require_subjects: bool = False, + ) -> None: self.path = path self.subject_index = subject_index self.language = language self.require_subjects = require_subjects - def __iter__(self): + def __iter__(self) -> Iterator[Union[Tuple[str, str], Tuple[str, None]]]: """Iterate through the directory, yielding tuples of (docfile, subjectfile) containing file paths. 
If require_subjects is False, the subjectfile will be returned as None.""" @@ -42,7 +53,7 @@ def __iter__(self): yield (filename, None) @property - def documents(self): + def documents(self) -> Iterator[Document]: for docfilename, subjfilename in self: with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile: text = docfile.read() @@ -59,12 +70,12 @@ def documents(self): class DocumentFile(DocumentCorpus): """A TSV file as a corpus of documents with subjects""" - def __init__(self, path, subject_index): + def __init__(self, path: str, subject_index: SubjectIndex) -> None: self.path = path self.subject_index = subject_index @property - def documents(self): + def documents(self) -> Iterator[Document]: if self.path.endswith(".gz"): opener = gzip.open else: @@ -73,7 +84,7 @@ def documents(self): for line in tsvfile: yield from self._parse_tsv_line(line) - def _parse_tsv_line(self, line): + def _parse_tsv_line(self, line: str) -> Iterator[Document]: if "\t" in line: text, uris = line.split("\t", maxsplit=1) subject_ids = { From 76022aa35dc633be7ded841a3bf799d7644440da Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 29 May 2023 11:47:26 +0300 Subject: [PATCH 19/28] Upgrade to PEP 585 and PEP 604 typing features/syntax - Use standard collection types instead of types from Typing (PEP 585) - Write union types as X | Y (PEP 604) - Write optional values as X | None (PEP 604) --- annif/__init__.py | 8 ++--- annif/analyzer/analyzer.py | 5 ++-- annif/analyzer/spacy.py | 4 +-- annif/analyzer/voikko.py | 3 +- annif/backend/backend.py | 26 ++++++++-------- annif/backend/dummy.py | 8 ++--- annif/backend/ensemble.py | 22 +++++++------- annif/backend/fasttext.py | 16 +++++----- annif/backend/http.py | 10 +++---- annif/backend/hyperopt.py | 6 ++-- annif/backend/mixins.py | 13 ++++---- annif/backend/mllm.py | 17 ++++++----- annif/backend/nn_ensemble.py | 20 ++++++------- annif/backend/omikuji.py | 10 +++---- annif/backend/pav.py | 16 +++++----- annif/backend/stwfsa.py | 8 ++--- annif/backend/svc.py | 16 +++++----- annif/backend/tfidf.py | 7 +++-- annif/backend/yake.py | 24 +++++++-------- annif/cli_util.py | 26 ++++++++-------- annif/config.py | 13 ++++---- annif/corpus/combine.py | 4 +-- annif/corpus/document.py | 9 +++--- annif/corpus/skos.py | 19 ++++-------- annif/corpus/subject.py | 27 +++++++++-------- annif/corpus/types.py | 4 +-- annif/eval.py | 29 +++++++++--------- annif/exception.py | 6 ++-- annif/lexical/mllm.py | 53 +++++++++++++++++---------------- annif/lexical/tokenset.py | 10 +++---- annif/lexical/util.py | 6 ++-- annif/openapi/validation.py | 3 +- annif/project.py | 29 +++++++++--------- annif/registry.py | 14 ++++----- annif/rest.py | 45 ++++++++++++---------------- annif/suggestion.py | 17 ++++++----- annif/transform/__init__.py | 8 ++--- annif/transform/inputlimiter.py | 4 +-- annif/transform/langfilter.py | 6 ++-- annif/transform/transform.py | 16 +++++----- annif/vocab.py | 4 +-- 41 files changed, 286 insertions(+), 305 deletions(-) diff --git a/annif/__init__.py b/annif/__init__.py index f239f85bb..dc353634b 100644 --- a/annif/__init__.py +++ b/annif/__init__.py @@ -5,7 +5,7 @@ import logging import os import os.path -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING logging.basicConfig() logger = logging.getLogger("annif") @@ -18,7 +18,7 @@ from flask.app import Flask -def create_flask_app(config_name: Optional[str] = None) -> Flask: +def create_flask_app(config_name: str | None = None) 
-> Flask: """Create a Flask app to be used by the CLI.""" from flask import Flask @@ -30,7 +30,7 @@ def create_flask_app(config_name: Optional[str] = None) -> Flask: return app -def create_app(config_name: Optional[str] = None) -> Flask: +def create_app(config_name: str | None = None) -> Flask: """Create a Connexion app to be used for the API.""" # 'cxapp' here is the Connexion application that has a normal Flask app # as a property (cxapp.app) @@ -67,7 +67,7 @@ def create_app(config_name: Optional[str] = None) -> Flask: return cxapp.app -def _get_config_name(config_name: Optional[str]) -> str: +def _get_config_name(config_name: str | None) -> str: if config_name is None: config_name = os.environ.get("ANNIF_CONFIG") if config_name is None: diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 137a5db18..5ba876f9d 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -4,7 +4,6 @@ import abc import functools import unicodedata -from typing import List _KEY_TOKEN_MIN_LENGTH = "token_min_length" @@ -21,7 +20,7 @@ def __init__(self, **kwargs) -> None: if _KEY_TOKEN_MIN_LENGTH in kwargs: self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH]) - def tokenize_sentences(self, text: str) -> List[str]: + def tokenize_sentences(self, text: str) -> list[str]: """Tokenize a piece of text (e.g. a document) into sentences.""" import nltk.tokenize @@ -38,7 +37,7 @@ def is_valid_token(self, word: str) -> bool: return True return False - def tokenize_words(self, text: str, filter: bool = True) -> List[str]: + def tokenize_words(self, text: str, filter: bool = True) -> list[str]: """Tokenize a piece of text (e.g. a sentence) into words. If filter=True (default), only return valid tokens (e.g. not punctuation, numbers or very short words)""" diff --git a/annif/analyzer/spacy.py b/annif/analyzer/spacy.py index 6579e861b..b5e9cbc55 100644 --- a/annif/analyzer/spacy.py +++ b/annif/analyzer/spacy.py @@ -1,8 +1,6 @@ """spaCy analyzer for Annif which uses spaCy for lemmatization""" from __future__ import annotations -from typing import List - import annif.util from annif.exception import OperationFailedException @@ -31,7 +29,7 @@ def __init__(self, param: str, **kwargs) -> None: self.lowercase = False super().__init__(**kwargs) - def tokenize_words(self, text: str, filter: bool = True) -> List[str]: + def tokenize_words(self, text: str, filter: bool = True) -> list[str]: lemmas = [ lemma for lemma in (token.lemma_ for token in self.nlp(text.strip())) diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py index 1006ce358..e6e693d65 100644 --- a/annif/analyzer/voikko.py +++ b/annif/analyzer/voikko.py @@ -2,7 +2,6 @@ from __future__ import annotations import functools -from typing import Dict, Optional import voikko.libvoikko @@ -17,7 +16,7 @@ def __init__(self, param: str, **kwargs) -> None: self.voikko = None super().__init__(**kwargs) - def __getstate__(self) -> Dict[str, Optional[str]]: + def __getstate__(self) -> dict[str, str | None]: """Return the state of the object for pickling purposes. 
The Voikko instance is set to None because as a ctypes object it cannot be pickled.""" diff --git a/annif/backend/backend.py b/annif/backend/backend.py index f69e1f55f..6742493b7 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -5,7 +5,7 @@ import os.path from datetime import datetime, timezone from glob import glob -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any from annif import logger from annif.suggestion import SuggestionBatch @@ -24,7 +24,7 @@ class AnnifBackend(metaclass=abc.ABCMeta): DEFAULT_PARAMETERS = {"limit": 100} def __init__( - self, backend_id: str, config_params: Dict[str, Any], project: AnnifProject + self, backend_id: str, config_params: dict[str, Any], project: AnnifProject ) -> None: """Initialize backend with specific parameters. The parameters are a dict. Keys and values depend on the specific @@ -34,11 +34,11 @@ def __init__( self.project = project self.datadir = project.datadir - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: return self.DEFAULT_PARAMETERS @property - def params(self) -> Dict[str, Any]: + def params(self) -> dict[str, Any]: params = {} params.update(self.default_params()) params.update(self.config_params) @@ -49,7 +49,7 @@ def is_trained(self) -> bool: return bool(glob(os.path.join(self.datadir, "*"))) @property - def modification_time(self) -> Optional[datetime.datetime]: + def modification_time(self) -> datetime.datetime | None: mtimes = [ datetime.utcfromtimestamp(os.path.getmtime(p)) for p in glob(os.path.join(self.datadir, "*")) @@ -61,8 +61,8 @@ def modification_time(self) -> Optional[datetime.datetime]: def _get_backend_params( self, - params: Optional[Dict[str, Any]], - ) -> Dict[str, Any]: + params: dict[str, Any] | None, + ) -> dict[str, Any]: backend_params = dict(self.params) if params is not None: backend_params.update(params) @@ -71,7 +71,7 @@ def _get_backend_params( def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: """This method can be overridden by backends. It implements @@ -81,7 +81,7 @@ def _train( def train( self, corpus: DocumentCorpus, - params: Optional[Dict[str, Any]] = None, + params: dict[str, Any] | None = None, jobs: int = 0, ) -> None: """Train the model on the given document or subject corpus.""" @@ -102,7 +102,7 @@ def _suggest(self, text, params): pass # pragma: no cover def _suggest_batch( - self, texts: List[str], params: Dict[str, Any] + self, texts: list[str], params: dict[str, Any] ) -> SuggestionBatch: """This method can be implemented by backends to use batching of documents in their operations. 
This default implementation uses the regular suggest @@ -115,8 +115,8 @@ def _suggest_batch( def suggest( self, - texts: List[str], - params: Optional[Dict[str, Any]] = None, + texts: list[str], + params: dict[str, Any] | None = None, ) -> SuggestionBatch: """Suggest subjects for the input documents and return a list of subject sets represented as a list of SubjectSuggestion objects.""" @@ -149,7 +149,7 @@ def _learn(self, corpus, params): def learn( self, corpus: DocumentCorpus, - params: Optional[Dict[str, Any]] = None, + params: dict[str, Any] | None = None, ) -> None: """Further train the model on the given document or subject corpus.""" beparams = self._get_backend_params(params) diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index b7a0fd357..5f62517a5 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -1,7 +1,7 @@ """Dummy backend for testing basic interaction of projects and backends""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List +from typing import TYPE_CHECKING, Any from annif.suggestion import SubjectSuggestion @@ -18,13 +18,13 @@ class DummyBackend(backend.AnnifLearningBackend): is_trained = True modification_time = None - def default_params(self) -> Dict[str, int]: + def default_params(self) -> dict[str, int]: return backend.AnnifBackend.DEFAULT_PARAMETERS def initialize(self, parallel: bool = False) -> None: self.initialized = True - def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: dict[str, Any]) -> list[SubjectSuggestion]: score = float(params.get("score", 1.0)) # Ensure tests fail if "text" with wrong type ends up here @@ -45,7 +45,7 @@ def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion] def _learn( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], ) -> None: # in this dummy backend we "learn" by picking up the subject ID # of the first subject of the first document in the learning set diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index a8a93d833..9b22d915e 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -1,7 +1,7 @@ """Ensemble backend that combines results from multiple projects""" from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any import annif.eval import annif.parallel @@ -22,7 +22,7 @@ class BaseEnsembleBackend(backend.AnnifBackend): """Base class for ensemble backends""" - def _get_sources_attribute(self, attr: str) -> List[Optional[bool]]: + def _get_sources_attribute(self, attr: str) -> list[bool | None]: params = self._get_backend_params(None) sources = annif.util.parse_sources(params["sources"]) return [ @@ -38,8 +38,8 @@ def initialize(self, parallel: bool = False) -> None: project.initialize(parallel) def _suggest_with_sources( - self, texts: List[str], sources: List[Tuple[str, float]] - ) -> Dict[str, SuggestionBatch]: + self, texts: list[str], sources: list[tuple[str, float]] + ) -> dict[str, SuggestionBatch]: return { project_id: self.project.registry.get_project(project_id).suggest(texts) for project_id, _ in sources @@ -47,9 +47,9 @@ def _suggest_with_sources( def _merge_source_batches( self, - batch_by_source: Dict[str, SuggestionBatch], - sources: List[Tuple[str, float]], - params: Dict[str, Any], + batch_by_source: dict[str, SuggestionBatch], + sources: list[tuple[str, float]], + params: dict[str, Any], ) -> 
SuggestionBatch: """Merge the given SuggestionBatches from each source into a single SuggestionBatch. The default implementation computes a weighted @@ -63,7 +63,7 @@ def _merge_source_batches( ) def _suggest_batch( - self, texts: List[str], params: Dict[str, Any] + self, texts: list[str], params: dict[str, Any] ) -> SuggestionBatch: sources = annif.util.parse_sources(params["sources"]) batch_by_source = self._suggest_with_sources(texts, sources) @@ -109,11 +109,11 @@ def _prepare(self, n_jobs: int = 1) -> None: self._source_batches.append(suggestions) self._gold_batches.append(gold_batch) - def _normalize(self, hps: Dict[str, float]) -> Dict[str, float]: + def _normalize(self, hps: dict[str, float]) -> dict[str, float]: total = sum(hps.values()) return {source: hps[source] / total for source in hps} - def _format_cfg_line(self, hps: Dict[str, float]) -> str: + def _format_cfg_line(self, hps: dict[str, float]) -> str: return "sources=" + ",".join( [f"{src}:{weight:.4f}" for src, weight in hps.items()] ) @@ -159,5 +159,5 @@ def get_hp_optimizer( ) -> EnsembleOptimizer: return EnsembleOptimizer(self, corpus, metric) - def _train(self, corpus: DocumentCorpus, params: Dict[str, Any], jobs: int = 0): + def _train(self, corpus: DocumentCorpus, params: dict[str, Any], jobs: int = 0): raise NotSupportedException("Training ensemble backend is not possible.") diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index fd5ed770b..23c33539a 100644 --- a/annif/backend/fasttext.py +++ b/annif/backend/fasttext.py @@ -3,7 +3,7 @@ import collections import os.path -from typing import TYPE_CHECKING, Any, Dict, List, Tuple +from typing import TYPE_CHECKING, Any import fasttext @@ -56,7 +56,7 @@ class FastTextBackend(mixins.ChunkingBackend, backend.AnnifBackend): # defaults for uninitialized instances _model = None - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(mixins.ChunkingBackend.DEFAULT_PARAMETERS) params.update(self.DEFAULT_PARAMETERS) @@ -119,7 +119,7 @@ def _create_train_file( corpus, self.datadir, self.TRAIN_FILE, method=self._write_train_file ) - def _create_model(self, params: Dict[str, Any], jobs: int) -> None: + def _create_model(self, params: dict[str, Any], jobs: int) -> None: self.info("creating fastText model") trainpath = os.path.join(self.datadir, self.TRAIN_FILE) modelpath = os.path.join(self.datadir, self.MODEL_FILE) @@ -137,7 +137,7 @@ def _create_model(self, params: Dict[str, Any], jobs: int) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: if corpus != "cached": @@ -151,8 +151,8 @@ def _train( self._create_model(params, jobs) def _predict_chunks( - self, chunktexts: List[str], limit: int - ) -> Tuple[List[List[str]], List[ndarray]]: + self, chunktexts: list[str], limit: int + ) -> tuple[list[list[str]], list[ndarray]]: return self._model.predict( list( filter( @@ -163,8 +163,8 @@ def _predict_chunks( ) def _suggest_chunks( - self, chunktexts: List[str], params: Dict[str, Any] - ) -> List[SubjectSuggestion]: + self, chunktexts: list[str], params: dict[str, Any] + ) -> list[SubjectSuggestion]: limit = int(params["limit"]) chunklabels, chunkscores = self._predict_chunks(chunktexts, limit) label_scores = collections.defaultdict(float) diff --git a/annif/backend/http.py b/annif/backend/http.py index 85298bbee..f57511f64 100644 --- a/annif/backend/http.py +++ b/annif/backend/http.py @@ 
-3,7 +3,7 @@ from __future__ import annotations import importlib -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any import dateutil.parser import requests @@ -23,7 +23,7 @@ class HTTPBackend(backend.AnnifBackend): _headers = None @property - def headers(self) -> Dict[str, str]: + def headers(self) -> dict[str, str]: if self._headers is None: version = importlib.metadata.version("annif") self._headers = { @@ -36,13 +36,13 @@ def is_trained(self) -> bool: return self._get_project_info("is_trained") @property - def modification_time(self) -> Optional[datetime]: + def modification_time(self) -> datetime | None: mtime = self._get_project_info("modification_time") if mtime is None: return None return dateutil.parser.parse(mtime) - def _get_project_info(self, key: str) -> Optional[Union[bool, str]]: + def _get_project_info(self, key: str) -> bool | str | None: params = self._get_backend_params(None) try: req = requests.get( @@ -63,7 +63,7 @@ def _get_project_info(self, key: str) -> Optional[Union[bool, str]]: else: return None - def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: dict[str, Any]) -> list[SubjectSuggestion]: data = {"text": text} if "project" in params: data["project"] = params["project"] diff --git a/annif/backend/hyperopt.py b/annif/backend/hyperopt.py index d05841e2f..7e1506c03 100644 --- a/annif/backend/hyperopt.py +++ b/annif/backend/hyperopt.py @@ -4,7 +4,7 @@ import abc import collections import warnings -from typing import TYPE_CHECKING, Callable, Dict, Optional +from typing import TYPE_CHECKING, Callable import optuna import optuna.exceptions @@ -77,13 +77,13 @@ def _postprocess(self, study: Study) -> HPRecommendation: """Convert the study results into hyperparameter recommendations""" pass # pragma: no cover - def _normalize(self, hps: Dict[str, float]) -> Dict[str, float]: + def _normalize(self, hps: dict[str, float]) -> dict[str, float]: """Normalize the given raw hyperparameters. Intended to be overridden by subclasses when necessary. 
The default is to keep them as-is.""" return hps def optimize( - self, n_trials: int, n_jobs: int, results_file: Optional[LazyFile] + self, n_trials: int, n_jobs: int, results_file: LazyFile | None ) -> HPRecommendation: """Find the optimal hyperparameters by testing up to the given number of hyperparameter combinations""" diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py index 525dc18e4..942f9cad0 100644 --- a/annif/backend/mixins.py +++ b/annif/backend/mixins.py @@ -3,7 +3,8 @@ import abc import os.path -from typing import TYPE_CHECKING, Any, Dict, Iterator, List +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any import joblib from sklearn.feature_extraction.text import TfidfVectorizer @@ -22,19 +23,19 @@ class ChunkingBackend(metaclass=abc.ABCMeta): DEFAULT_PARAMETERS = {"chunksize": 1} - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: return self.DEFAULT_PARAMETERS @abc.abstractmethod def _suggest_chunks( - self, chunktexts: List[str], params: Dict[str, Any] - ) -> List[SubjectSuggestion]: + self, chunktexts: list[str], params: dict[str, Any] + ) -> list[SubjectSuggestion]: """Suggest subjects for the chunked text; should be implemented by the subclass inheriting this mixin""" pass # pragma: no cover - def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: dict[str, Any]) -> list[SubjectSuggestion]: self.debug( 'Suggesting subjects for text "{}..." (len={})'.format(text[:20], len(text)) ) @@ -70,7 +71,7 @@ def initialize_vectorizer(self) -> None: ) def create_vectorizer( - self, input: Iterator[str], params: Dict[str, Any] = {} + self, input: Iterator[str], params: dict[str, Any] = {} ) -> csr_matrix: self.info("creating vectorizer") self.vectorizer = TfidfVectorizer(**params) diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index 138f98282..2d4337c6e 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -2,7 +2,8 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Tuple +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any import joblib import numpy as np @@ -93,7 +94,7 @@ class MLLMBackend(hyperopt.AnnifHyperoptBackend): def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> MLLMOptimizer: return MLLMOptimizer(self, corpus, metric) - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -108,7 +109,7 @@ def _load_model(self) -> MLLMModel: "model {} not found".format(path), backend_id=self.backend_id ) - def _load_train_data(self) -> Tuple[np.ndarray, np.ndarray]: + def _load_train_data(self) -> tuple[np.ndarray, np.ndarray]: path = os.path.join(self.datadir, self.TRAIN_FILE) if os.path.exists(path): return joblib.load(path) @@ -124,7 +125,7 @@ def initialize(self, parallel: bool = False) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: self.info("starting train") @@ -152,20 +153,20 @@ def _train( self.info("saving model") annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE) - def _generate_candidates(self, text: str) -> List[Candidate]: + def _generate_candidates(self, text: str) -> list[Candidate]: return self._model.generate_candidates(text, self.project.analyzer) def 
_prediction_to_result( self, - prediction: List[Tuple[np.float64, int]], - params: Dict[str, Any], + prediction: list[tuple[np.float64, int]], + params: dict[str, Any], ) -> Iterator: vector = np.zeros(len(self.project.subjects), dtype=np.float32) for score, subject_id in prediction: vector[subject_id] = score return vector_to_suggestions(vector, int(params["limit"])) - def _suggest(self, text: str, params: Dict[str, Any]) -> Iterator: + def _suggest(self, text: str, params: dict[str, Any]) -> Iterator: candidates = self._generate_candidates(text) prediction = self._model.predict(candidates) return self._prediction_to_result(prediction, params) diff --git a/annif/backend/nn_ensemble.py b/annif/backend/nn_ensemble.py index 5fb82fb28..658bd79be 100644 --- a/annif/backend/nn_ensemble.py +++ b/annif/backend/nn_ensemble.py @@ -5,7 +5,7 @@ import os.path import shutil from io import BytesIO -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any import joblib import lmdb @@ -35,7 +35,7 @@ def idx_to_key(idx: int) -> bytes: return b"%08d" % idx -def key_to_idx(key: Union[memoryview, bytes]) -> int: +def key_to_idx(key: memoryview | bytes) -> int: """convert a binary LMDB key to an integer index""" return int(key) @@ -64,7 +64,7 @@ def add_sample(self, inputs: np.ndarray, targets: np.ndarray) -> None: buf.seek(0) self._txn.put(key, buf.read()) - def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: + def __getitem__(self, idx: int) -> tuple[np.ndarray, np.ndarray]: """get a particular batch of samples""" cursor = self._txn.cursor() first_key = idx * self._batch_size @@ -112,7 +112,7 @@ class NNEnsembleBackend(backend.AnnifLearningBackend, ensemble.BaseEnsembleBacke # defaults for uninitialized instances _model = None - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -138,9 +138,9 @@ def initialize(self, parallel: bool = False) -> None: def _merge_source_batches( self, - batch_by_source: Dict[str, SuggestionBatch], - sources: List[Tuple[str, float]], - params: Dict[str, Any], + batch_by_source: dict[str, SuggestionBatch], + sources: list[tuple[str, float]], + params: dict[str, Any], ) -> SuggestionBatch: src_weight = dict(sources) score_vectors = np.array( @@ -164,7 +164,7 @@ def _merge_source_batches( self.project.subjects, ) - def _create_model(self, sources: List[Tuple[str, float]]) -> None: + def _create_model(self, sources: list[tuple[str, float]]) -> None: self.info("creating NN ensemble model") inputs = Input(shape=(len(self.project.subjects), len(sources))) @@ -199,7 +199,7 @@ def _create_model(self, sources: List[Tuple[str, float]]) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: sources = annif.util.parse_sources(self.params["sources"]) @@ -286,7 +286,7 @@ def _fit_model( def _learn( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], ) -> None: self.initialize() self._fit_model( diff --git a/annif/backend/omikuji.py b/annif/backend/omikuji.py index 7c47c1b8a..6c864b89e 100644 --- a/annif/backend/omikuji.py +++ b/annif/backend/omikuji.py @@ -3,7 +3,7 @@ import os.path import shutil -from typing import TYPE_CHECKING, Any, Dict, List +from typing import TYPE_CHECKING, Any import omikuji @@ -43,7 +43,7 @@ class OmikujiBackend(mixins.TfidfVectorizerMixin, 
backend.AnnifBackend): "collapse_every_n_layers": 0, } - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -96,7 +96,7 @@ def _create_train_file(self, veccorpus: csr_matrix, corpus: DocumentCorpus) -> N trainfile.seek(0) print("{:08d}".format(n_samples), end="", file=trainfile) - def _create_model(self, params: Dict[str, Any], jobs: int) -> None: + def _create_model(self, params: dict[str, Any], jobs: int) -> None: train_path = os.path.join(self.datadir, self.TRAIN_FILE) model_path = os.path.join(self.datadir, self.MODEL_FILE) hyper_param = omikuji.Model.default_hyper_param() @@ -114,7 +114,7 @@ def _create_model(self, params: Dict[str, Any], jobs: int) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: if corpus != "cached": @@ -135,7 +135,7 @@ def _train( self._create_model(params, jobs) def _suggest_batch( - self, texts: List[str], params: Dict[str, Any] + self, texts: list[str], params: dict[str, Any] ) -> SuggestionBatch: vector = self.vectorizer.transform(texts) limit = int(params["limit"]) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index fc0571274..da8a6e2c1 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -5,7 +5,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Any, Dict, List, Tuple +from typing import TYPE_CHECKING, Any import joblib import numpy as np @@ -36,7 +36,7 @@ class PAVBackend(ensemble.BaseEnsembleBackend): DEFAULT_PARAMETERS = {"min-docs": 10} - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -59,15 +59,15 @@ def initialize(self, parallel: bool = False) -> None: backend_id=self.backend_id, ) - def _get_model(self, source_project_id: str) -> Dict[int, IsotonicRegression]: + def _get_model(self, source_project_id: str) -> dict[int, IsotonicRegression]: self.initialize() return self._models[source_project_id] def _merge_source_batches( self, - batch_by_source: Dict[str, SuggestionBatch], - sources: List[Tuple[str, float]], - params: Dict[str, Any], + batch_by_source: dict[str, SuggestionBatch], + sources: list[tuple[str, float]], + params: dict[str, Any], ) -> SuggestionBatch: reg_batch_by_source = {} for project_id, batch in batch_by_source.items(): @@ -95,7 +95,7 @@ def _merge_source_batches( @staticmethod def _suggest_train_corpus( source_project: AnnifProject, corpus: DocumentCorpus - ) -> Tuple[csc_matrix, csc_matrix]: + ) -> tuple[csc_matrix, csc_matrix]: # lists for constructing score matrix data, row, col = [], [], [] # lists for constructing true label matrix @@ -156,7 +156,7 @@ def _create_pav_model( def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: if corpus == "cached": diff --git a/annif/backend/stwfsa.py b/annif/backend/stwfsa.py index 69e93d770..fdc962b11 100644 --- a/annif/backend/stwfsa.py +++ b/annif/backend/stwfsa.py @@ -1,7 +1,7 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Any, Dict, List, Tuple +from typing import TYPE_CHECKING, Any from stwfsapy.predictor import StwfsapyPredictor @@ -77,7 +77,7 @@ def initialize(self, parallel: bool = False) -> None: f"Model not found at {path}", 
backend_id=self.backend_id ) - def _load_data(self, corpus: DocumentCorpus) -> Tuple[List[str], List[List[str]]]: + def _load_data(self, corpus: DocumentCorpus) -> tuple[list[str], list[list[str]]]: if corpus == "cached": raise NotSupportedException( "Training stwfsa project from cached data not supported." @@ -102,7 +102,7 @@ def _load_data(self, corpus: DocumentCorpus) -> Tuple[List[str], List[List[str]] def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: X, y = self._load_data(corpus) @@ -125,7 +125,7 @@ def _train( lambda model, store_path: model.store(store_path), ) - def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: dict[str, Any]) -> list[SubjectSuggestion]: self.debug(f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') result = self._model.suggest_proba([text])[0] suggestions = [] diff --git a/annif/backend/svc.py b/annif/backend/svc.py index 257afeca8..1e7932c3e 100644 --- a/annif/backend/svc.py +++ b/annif/backend/svc.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, Any, Dict, List, Tuple +from typing import TYPE_CHECKING, Any import joblib import numpy as np @@ -33,7 +33,7 @@ class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend): DEFAULT_PARAMETERS = {"min_df": 1, "ngram": 1} - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -55,7 +55,7 @@ def initialize(self, parallel: bool = False) -> None: def _corpus_to_texts_and_classes( self, corpus: DocumentCorpus - ) -> Tuple[List[str], List[int]]: + ) -> tuple[list[str], list[int]]: texts = [] classes = [] for doc in corpus.documents: @@ -70,7 +70,7 @@ def _corpus_to_texts_and_classes( classes.append(doc.subject_set[0]) return texts, classes - def _train_classifier(self, veccorpus: csr_matrix, classes: List[int]) -> None: + def _train_classifier(self, veccorpus: csr_matrix, classes: list[int]) -> None: self.info("creating classifier") self._model = LinearSVC() self._model.fit(veccorpus, classes) @@ -79,7 +79,7 @@ def _train_classifier(self, veccorpus: csr_matrix, classes: List[int]) -> None: ) def _train( - self, corpus: DocumentCorpus, params: Dict[str, Any], jobs: int = 0 + self, corpus: DocumentCorpus, params: dict[str, Any], jobs: int = 0 ) -> None: if corpus == "cached": raise NotSupportedException( @@ -97,8 +97,8 @@ def _train( self._train_classifier(veccorpus, classes) def _scores_to_suggestions( - self, scores: np.ndarray, params: Dict[str, Any] - ) -> List[SubjectSuggestion]: + self, scores: np.ndarray, params: dict[str, Any] + ) -> list[SubjectSuggestion]: results = [] limit = int(params["limit"]) for class_id in np.argsort(scores)[::-1][:limit]: @@ -110,7 +110,7 @@ def _scores_to_suggestions( return results def _suggest_batch( - self, texts: List[str], params: Dict[str, Any] + self, texts: list[str], params: dict[str, Any] ) -> SuggestionBatch: vector = self.vectorizer.transform(texts) confidences = self._model.decision_function(vector) diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index 89ddbdcf2..bf2f6c40d 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -4,7 +4,8 @@ import os.path import tempfile -from typing import TYPE_CHECKING, Any, Dict, Iterator +from collections.abc import Iterator +from typing import 
TYPE_CHECKING, Any import gensim.similarities from gensim.matutils import Sparse2Corpus @@ -112,7 +113,7 @@ def _create_index(self, veccorpus: csr_matrix) -> None: def _train( self, corpus: DocumentCorpus, - params: Dict[str, Any], + params: dict[str, Any], jobs: int = 0, ) -> None: if corpus == "cached": @@ -126,7 +127,7 @@ def _train( veccorpus = self.create_vectorizer(subjects) self._create_index(veccorpus) - def _suggest(self, text: str, params: Dict[str, Any]) -> Iterator: + def _suggest(self, text: str, params: dict[str, Any]) -> Iterator: self.debug( 'Suggesting subjects for text "{}..." (len={})'.format(text[:20], len(text)) ) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index b36b6ec1c..5d853f4c5 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -6,7 +6,7 @@ import os.path import re from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple +from typing import TYPE_CHECKING, Any import joblib import yake @@ -46,7 +46,7 @@ class YakeBackend(backend.AnnifBackend): "remove_parentheses": False, } - def default_params(self) -> Dict[str, Any]: + def default_params(self) -> dict[str, Any]: params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() params.update(self.DEFAULT_PARAMETERS) return params @@ -56,7 +56,7 @@ def is_trained(self): return True @property - def label_types(self) -> List[URIRef]: + def label_types(self) -> list[URIRef]: if type(self.params["label_types"]) == str: # Label types set by user label_types = [lt.strip() for lt in self.params["label_types"].split(",")] self._validate_label_types(label_types) @@ -64,7 +64,7 @@ def label_types(self) -> List[URIRef]: label_types = self.params["label_types"] # The defaults return [getattr(SKOS, lt) for lt in label_types] - def _validate_label_types(self, label_types: List[str]) -> None: + def _validate_label_types(self, label_types: list[str]) -> None: for lt in label_types: if lt not in ("prefLabel", "altLabel", "hiddenLabel"): raise ConfigurationException( @@ -91,7 +91,7 @@ def _save_index(self, path: str) -> None: self._index, self.datadir, self.INDEX_FILE, method=joblib.dump ) - def _create_index(self) -> Dict[str, Set[str]]: + def _create_index(self) -> dict[str, set[str]]: index = defaultdict(set) skos_vocab = self.project.vocab.skos for concept in skos_vocab.concepts: @@ -117,7 +117,7 @@ def _sort_phrase(self, phrase: str) -> str: words = phrase.split() return " ".join(sorted(words)) - def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion]: + def _suggest(self, text: str, params: dict[str, Any]) -> list[SubjectSuggestion]: self.debug(f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') limit = int(params["limit"]) @@ -141,8 +141,8 @@ def _suggest(self, text: str, params: Dict[str, Any]) -> List[SubjectSuggestion] return subject_suggestions def _keyphrases2suggestions( - self, keyphrases: List[Tuple[str, float64]] - ) -> List[Tuple[str, float64]]: + self, keyphrases: list[tuple[str, float64]] + ) -> list[tuple[str, float64]]: suggestions = [] not_matched = [] for kp, score in keyphrases: @@ -164,7 +164,7 @@ def _keyphrases2suggestions( ) return suggestions - def _keyphrase2uris(self, keyphrase: str) -> Set[str]: + def _keyphrase2uris(self, keyphrase: str) -> set[str]: keyphrase = self._normalize_phrase(keyphrase) keyphrase = self._sort_phrase(keyphrase) return self._index.get(keyphrase, []) @@ -174,8 +174,8 @@ def _transform_score(self, score: float64) -> float64: return 1.0 / (score + 1) def _combine_suggestions( - self, suggestions: List[Tuple[str, float], Tuple[str, float64]] - ) -> List[Tuple[str, float], Tuple[str, float64]]: + self, suggestions: list[tuple[str, float], tuple[str, float64]] + ) -> list[tuple[str, float], tuple[str, float64]]: combined_suggestions = {} for uri, score in suggestions: if uri not in combined_suggestions: @@ -192,5 +192,5 @@ def _combine_scores(self, score1: float, score2: float) -> float: confl = score1 * score2 / (score1 * score2 + (1 - score1) * (1 - score2)) return (confl - 0.5) * 2 - def _train(self, corpus: DocumentCorpus, params: Dict[str, Any], jobs: int = 0): + def _train(self, corpus: DocumentCorpus, params: dict[str, Any], jobs: int = 0): raise NotSupportedException("Training yake backend is not possible.") diff --git a/annif/cli_util.py b/annif/cli_util.py index 6dad84790..4d636abc6 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -5,7 +5,7 @@ import itertools import os import sys -from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING import click import click_log @@ -31,7 +31,7 @@ def _set_project_config_file_path( - ctx: Context, param: Option, value: Optional[str] + ctx: Context, param: Option, value: str | None ) -> None: """Override the default path or the path given in env by CLI option""" with ctx.obj.load_app().app_context(): @@ -120,7 +120,7 @@ def make_list_template(*rows) -> str: ) -def format_datetime(dt: Optional[datetime]) -> str: +def format_datetime(dt: datetime | None) -> str: """Helper function to format a datetime object as a string in the local time.""" if dt is None: return "-" @@ -128,10 +128,10 @@ def format_datetime(dt: Optional[datetime]) -> str: def open_documents( - paths: Union[Tuple[str, ...], Tuple[()]], + paths: tuple[str, ...] | tuple[()], subject_index: SubjectIndex, vocab_lang: str, - docs_limit: Optional[int], + docs_limit: int | None, ) -> DocumentCorpus: """Helper function to open a document corpus from a list of pathnames, each of which is either a TSV file or a directory of TXT files. For @@ -160,9 +160,7 @@ def open_doc_path(path, subject_index): return docs -def open_text_documents( - paths: Tuple[str, ...], docs_limit: Optional[int] -) -> DocumentList: +def open_text_documents(paths: tuple[str, ...], docs_limit: int | None) -> DocumentList: """ Helper function to read text documents from the given file paths. Returns a DocumentList object with Documents having no subjects. 
If a path is "-", the @@ -186,7 +184,7 @@ def show_hits( hits: SuggestionResult, project: AnnifProject, lang: str, - file: Optional[TextIOWrapper] = None, + file: TextIOWrapper | None = None, ) -> None: """ Print subject suggestions to the console or a file. The suggestions are displayed as @@ -205,8 +203,8 @@ def show_hits( def parse_backend_params( - backend_param: Union[Tuple[str, ...], Tuple[()]], project: AnnifProject -) -> DefaultDict[str, Dict[str, str]]: + backend_param: tuple[str, ...] | tuple[()], project: AnnifProject +) -> collections.defaultdict[str, dict[str, str]]: """Parse a list of backend parameters given with the --backend-param option into a nested dict structure""" backend_params = collections.defaultdict(dict) @@ -226,7 +224,7 @@ def _validate_backend_params(backend: str, beparam: str, project: AnnifProject) ) -def generate_filter_params(filter_batch_max_limit: int) -> List[Tuple[int, float]]: +def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float]]: limits = range(1, filter_batch_max_limit + 1) thresholds = [i * 0.05 for i in range(20)] return list(itertools.product(limits, thresholds)) @@ -234,7 +232,7 @@ def generate_filter_params(filter_batch_max_limit: int) -> List[Tuple[int, float def _get_completion_choices( param: Argument, -) -> Dict[str, Union[AnnifVocabulary, AnnifProject]]: +) -> dict[str, AnnifVocabulary | AnnifProject]: if param.name == "project_id": return annif.registry.get_projects() elif param.name == "vocab_id": @@ -243,7 +241,7 @@ def _get_completion_choices( return [] -def complete_param(ctx: Context, param: Argument, incomplete: str) -> List[str]: +def complete_param(ctx: Context, param: Argument, incomplete: str) -> list[str]: with ctx.obj.load_app().app_context(): return [ choice diff --git a/annif/config.py b/annif/config.py index 2a4f0ac9a..ab8f0d568 100644 --- a/annif/config.py +++ b/annif/config.py @@ -4,7 +4,6 @@ import configparser import os.path from glob import glob -from typing import Dict, List, Optional, Union import tomli @@ -32,7 +31,7 @@ def __init__(self, filename: str) -> None: raise ConfigurationException(err.message) @property - def project_ids(self) -> List[str]: + def project_ids(self) -> list[str]: return self._config.sections() def __getitem__(self, key: str) -> configparser.SectionProxy: @@ -56,7 +55,7 @@ def __init__(self, filename: str) -> None: def project_ids(self): return self._config.keys() - def __getitem__(self, key: str) -> Dict[str, str]: + def __getitem__(self, key: str) -> dict[str, str]: return self._config[key] @@ -87,11 +86,11 @@ def _check_duplicate_project_ids(self, proj_id: str, file: str) -> None: def project_ids(self): return self._config.keys() - def __getitem__(self, key: str) -> Union[Dict[str, str], configparser.SectionProxy]: + def __getitem__(self, key: str) -> dict[str, str] | configparser.SectionProxy: return self._config[key] -def check_config(projects_config_path: str) -> Optional[str]: +def check_config(projects_config_path: str) -> str | None: if os.path.exists(projects_config_path): return projects_config_path else: @@ -105,7 +104,7 @@ def check_config(projects_config_path: str) -> Optional[str]: return None -def find_config() -> Optional[str]: +def find_config() -> str | None: for path in ("projects.cfg", "projects.toml", "projects.d"): if os.path.exists(path): return path @@ -122,7 +121,7 @@ def find_config() -> Optional[str]: def parse_config( projects_config_path: str, -) -> Optional[Union[AnnifConfigDirectory, AnnifConfigCFG, AnnifConfigTOML]]: +) -> 
AnnifConfigDirectory | AnnifConfigCFG | AnnifConfigTOML | None: if projects_config_path: projects_config_path = check_config(projects_config_path) else: diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index 067c316e3..75fcc7f55 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -2,7 +2,7 @@ from __future__ import annotations import itertools -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING from .types import DocumentCorpus @@ -14,7 +14,7 @@ class CombinedCorpus(DocumentCorpus): """Class for combining multiple corpora so they behave like a single corpus""" - def __init__(self, corpora: List[DocumentFile]) -> None: + def __init__(self, corpora: list[DocumentFile]) -> None: self._corpora = corpora @property diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 87937ebdb..78ea838d8 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -5,8 +5,9 @@ import gzip import os.path import re +from collections.abc import Iterator from itertools import islice -from typing import TYPE_CHECKING, Iterator, Optional, Tuple, Union +from typing import TYPE_CHECKING import annif.util @@ -25,8 +26,8 @@ class DocumentDirectory(DocumentCorpus): def __init__( self, path: str, - subject_index: Optional[SubjectIndex] = None, - language: Optional[str] = None, + subject_index: SubjectIndex | None = None, + language: str | None = None, require_subjects: bool = False, ) -> None: self.path = path @@ -34,7 +35,7 @@ def __init__( self.language = language self.require_subjects = require_subjects - def __iter__(self) -> Iterator[Union[Tuple[str, str], Tuple[str, None]]]: + def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]: """Iterate through the directory, yielding tuples of (docfile, subjectfile) containing file paths. 
If require_subjects is False, the subjectfile will be returned as None.""" diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 6a5fb3f15..97df7d700 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -4,16 +4,9 @@ import collections import os.path import shutil -from typing import ( - TYPE_CHECKING, - DefaultDict, - Dict, - Iterator, - List, - Sequence, - Set, - Union, -) +from collections import defaultdict +from collections.abc import Iterator, Sequence +from typing import TYPE_CHECKING import rdflib import rdflib.util @@ -76,7 +69,7 @@ def __init__(self, path: str) -> None: self.graph.parse(self.path, format=rdflib.util.guess_format(self.path)) @property - def languages(self) -> Set[str]: + def languages(self) -> set[str]: if self._languages is None: self._languages = { label.language @@ -87,7 +80,7 @@ def languages(self) -> Set[str]: } return self._languages - def _concept_labels(self, concept: URIRef) -> Dict[str, str]: + def _concept_labels(self, concept: URIRef) -> dict[str, str]: by_lang = self.get_concept_labels(concept, self.PREF_LABEL_PROPERTIES) return { lang: by_lang[lang][0] @@ -120,7 +113,7 @@ def get_concept_labels( self, concept: URIRef, label_types: Sequence[URIRef], - ) -> Union[DefaultDict[str, List[str]], DefaultDict[None, List[str]]]: + ) -> defaultdict[str, list[str]] | defaultdict[None, list[str]]: """return all the labels of the given concept with the given label properties as a dict-like object where the keys are language codes and the values are lists of labels in that language""" diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 0ef5d2b73..045867990 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -3,7 +3,8 @@ import csv import os.path -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple, Union +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any import annif import annif.util @@ -37,7 +38,7 @@ def _parse_line(self, line: str) -> Iterator[Subject]: yield Subject(uri=clean_uri, labels=labels, notation=notation) @property - def languages(self) -> List[str]: + def languages(self) -> list[str]: return [self.language] @property @@ -59,7 +60,7 @@ def __init__(self, path: str) -> None: """initialize the SubjectFileCSV given a path to a CSV file""" self.path = path - def _parse_row(self, row: Dict[str, str]) -> Iterator[Subject]: + def _parse_row(self, row: dict[str, str]) -> Iterator[Subject]: labels = { fname.replace("label_", ""): value or None for fname, value in row.items() @@ -78,7 +79,7 @@ def _parse_row(self, row: Dict[str, str]) -> Iterator[Subject]: ) @property - def languages(self) -> List[str]: + def languages(self) -> list[str]: # infer the supported languages from the CSV column names with open(self.path, encoding="utf-8-sig") as csvfile: reader = csv.reader(csvfile) @@ -130,10 +131,10 @@ def __len__(self) -> int: return len(self._subjects) @property - def languages(self) -> List[str]: + def languages(self) -> list[str]: return self._languages - def __getitem__(self, subject_id: Union[int, np.int32]) -> Subject: + def __getitem__(self, subject_id: int | np.int32) -> Subject: return self._subjects[subject_id] def append(self, subject: Subject) -> None: @@ -150,7 +151,7 @@ def append(self, subject: Subject) -> None: def contains_uri(self, uri: str) -> bool: return uri in self._uri_idx - def by_uri(self, uri: str, warnings: bool = True) -> Optional[int]: + def by_uri(self, uri: str, warnings: bool = True) -> int | None: """return the subject ID of a 
subject by its URI, or None if not found. If warnings=True, log a warning message if the URI cannot be found.""" try: @@ -160,7 +161,7 @@ def by_uri(self, uri: str, warnings: bool = True) -> Optional[int]: logger.warning("Unknown subject URI <%s>", uri) return None - def by_label(self, label: Optional[str], language: str) -> Optional[int]: + def by_label(self, label: str | None, language: str) -> int | None: """return the subject ID of a subject by its label in a given language""" try: @@ -169,7 +170,7 @@ def by_label(self, label: Optional[str], language: str) -> Optional[int]: logger.warning('Unknown subject label "%s"@%s', label, language) return None - def deprecated_ids(self) -> List[int]: + def deprecated_ids(self) -> list[int]: """return indices of deprecated subjects""" return [ @@ -179,7 +180,7 @@ def deprecated_ids(self) -> List[int]: ] @property - def active(self) -> List[Tuple[int, Subject]]: + def active(self) -> list[tuple[int, Subject]]: """return a list of (subject_id, subject) tuples of all subjects that are not deprecated""" @@ -217,7 +218,7 @@ def load(cls, path: str) -> SubjectIndex: class SubjectSet: """Represents a set of subjects for a document.""" - def __init__(self, subject_ids: Optional[Any] = None) -> None: + def __init__(self, subject_ids: Any | None = None) -> None: """Create a SubjectSet and optionally initialize it from an iterable of subject IDs""" @@ -260,7 +261,7 @@ def from_string( @staticmethod def _parse_line( line: str, - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[str | None, str | None]: uri = label = None vals = line.split("\t") for val in vals: @@ -275,7 +276,7 @@ def _parse_line( return uri, label def as_vector( - self, size: Optional[int] = None, destination: Optional[np.ndarray] = None + self, size: int | None = None, destination: np.ndarray | None = None ) -> np.ndarray: """Return the hits as a one-dimensional NumPy array in sklearn multilabel indicator format. Use destination array if given (not diff --git a/annif/corpus/types.py b/annif/corpus/types.py index 3a7531174..de3c20db9 100644 --- a/annif/corpus/types.py +++ b/annif/corpus/types.py @@ -3,8 +3,8 @@ import abc import collections +from collections.abc import Iterator from itertools import islice -from typing import Iterator, List Document = collections.namedtuple("Document", "text subject_set") @@ -21,7 +21,7 @@ def documents(self): pass # pragma: no cover @property - def doc_batches(self) -> Iterator[List[Document]]: + def doc_batches(self) -> Iterator[list[Document]]: """Iterate through the document corpus in batches, yielding lists of Document objects.""" it = iter(self.documents) diff --git a/annif/eval.py b/annif/eval.py index 57a8e163e..d20b4b67b 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -2,7 +2,8 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Dict, Iterator, List, Optional, Sequence, Union +from collections.abc import Iterator, Sequence +from typing import TYPE_CHECKING import numpy as np import scipy.sparse @@ -40,7 +41,7 @@ def false_negatives(y_true: csr_array, y_pred: csr_array) -> int: def dcg_score( - y_true: csr_array, y_pred: csr_array, limit: Optional[int] = None + y_true: csr_array, y_pred: csr_array, limit: int | None = None ) -> np.float64: """return the discounted cumulative gain (DCG) score for the selected labels vs. 
relevant labels""" @@ -56,9 +57,7 @@ def dcg_score( return (gain / discount).sum() -def ndcg_score( - y_true: csr_array, y_pred: csr_array, limit: Optional[int] = None -) -> float: +def ndcg_score(y_true: csr_array, y_pred: csr_array, limit: int | None = None) -> float: """return the normalized discounted cumulative gain (nDCG) score for the selected labels vs. relevant labels""" @@ -87,9 +86,9 @@ def __init__(self, subject_index: SubjectIndex) -> None: def evaluate_many( self, - suggestion_batch: Union[ - List[List[SubjectSuggestion]], SuggestionBatch, List[Iterator] - ], + suggestion_batch: list[list[SubjectSuggestion]] + | SuggestionBatch + | list[Iterator], gold_subject_batch: Sequence[SubjectSet], ) -> None: if not isinstance(suggestion_batch, SuggestionBatch): @@ -112,7 +111,7 @@ def _evaluate_samples( y_true: csr_array, y_pred: csr_array, metrics: Sequence[str] = [], - ) -> Dict[str, Union[np.float64, float, int]]: + ) -> dict[str, np.float64 | float | int]: y_pred_binary = y_pred > 0.0 # define the available metrics as lazy lambda functions @@ -183,7 +182,7 @@ def _evaluate_samples( return {metric: all_metrics[metric]() for metric in metrics} def _result_per_subject_header( - self, results_file: Union[LazyFile, TextIOWrapper] + self, results_file: LazyFile | TextIOWrapper ) -> None: print( "\t".join( @@ -203,7 +202,7 @@ def _result_per_subject_header( ) def _result_per_subject_body( - self, zipped_results: zip, results_file: Union[LazyFile, TextIOWrapper] + self, zipped_results: zip, results_file: LazyFile | TextIOWrapper ) -> None: for row in zipped_results: print("\t".join((str(e) for e in row)), file=results_file) @@ -212,7 +211,7 @@ def output_result_per_subject( self, y_true: csr_array, y_pred: csr_array, - results_file: Union[TextIOWrapper, LazyFile], + results_file: TextIOWrapper | LazyFile, language: str, ) -> None: """Write results per subject (non-aggregated) @@ -247,9 +246,9 @@ def output_result_per_subject( def results( self, metrics: Sequence[str] = [], - results_file: Optional[Union[LazyFile, TextIOWrapper]] = None, - language: Optional[str] = None, - ) -> Dict[str, Union[np.float64, float]]: + results_file: LazyFile | TextIOWrapper | None = None, + language: str | None = None, + ) -> dict[str, np.float64 | float]: """evaluate a set of selected subjects against a gold standard using different metrics. If metrics is empty, use all available metrics. 
If results_file (file object) given, write results per subject to it diff --git a/annif/exception.py b/annif/exception.py index 9f497324d..b4b9c6552 100644 --- a/annif/exception.py +++ b/annif/exception.py @@ -1,8 +1,6 @@ """Custom exceptions used by Annif""" from __future__ import annotations -from typing import Optional - from click import ClickException @@ -14,8 +12,8 @@ class AnnifException(ClickException): def __init__( self, message: str, - project_id: Optional[str] = None, - backend_id: Optional[str] = None, + project_id: str | None = None, + backend_id: str | None = None, ) -> None: super().__init__(message) self.project_id = project_id diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 36ba662de..86f87e698 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -3,9 +3,10 @@ import collections import math +from collections import defaultdict from enum import IntEnum from statistics import mean -from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Any import joblib import numpy as np @@ -55,7 +56,7 @@ ) -def conflate_matches(matches: List[Match], doc_length: int) -> List[Candidate]: +def conflate_matches(matches: list[Match], doc_length: int) -> list[Candidate]: subj_matches = collections.defaultdict(list) for match in matches: subj_matches[match.subject_id].append(match) @@ -80,7 +81,7 @@ def generate_candidates( analyzer: Analyzer, vectorizer: CountVectorizer, index: TokenSetIndex, -) -> List[Candidate]: +) -> list[Candidate]: sentences = analyzer.tokenize_sentences(text) sent_tokens = vectorizer.transform(sentences) matches = [] @@ -102,7 +103,7 @@ def generate_candidates( def candidates_to_features( - candidates: List[Candidate], mdata: "ModelData" + candidates: list[Candidate], mdata: "ModelData" ) -> np.ndarray: """Convert a list of Candidates to a NumPy feature matrix""" @@ -150,7 +151,7 @@ def candidates_to_features(cls, candidates): class MLLMModel: """Maui-like Lexical Matching model""" - def generate_candidates(self, text: str, analyzer: Analyzer) -> List[Candidate]: + def generate_candidates(self, text: str, analyzer: Analyzer) -> list[Candidate]: return generate_candidates(text, analyzer, self._vectorizer, self._index) @property @@ -165,11 +166,11 @@ def _model_data(self) -> ModelData: idf=self._idf, ) - def _candidates_to_features(self, candidates: List[Candidate]) -> np.ndarray: + def _candidates_to_features(self, candidates: list[Candidate]) -> np.ndarray: return candidates_to_features(candidates, self._model_data) @staticmethod - def _get_label_props(params: Dict[str, Any]) -> Tuple[List[URIRef], List[URIRef]]: + def _get_label_props(params: dict[str, Any]) -> tuple[list[URIRef], list[URIRef]]: pref_label_props = [SKOS.prefLabel] if annif.util.boolean(params["use_hidden_labels"]): @@ -183,8 +184,8 @@ def _prepare_terms( self, graph: Graph, vocab: AnnifVocabulary, - params: Dict[str, Any], - ) -> Tuple[List[Term], List[int]]: + params: dict[str, Any], + ) -> tuple[list[Term], list[int]]: pref_label_props, nonpref_label_props = self._get_label_props(params) terms = [] @@ -214,8 +215,8 @@ def _prepare_train_index( self, vocab: AnnifVocabulary, analyzer: Analyzer, - params: Dict[str, Any], - ) -> List[int]: + params: dict[str, Any], + ) -> list[int]: graph = vocab.as_graph() terms, subject_ids = self._prepare_terms(graph, vocab, params) self._prepare_relations(graph, vocab) @@ -240,7 +241,7 @@ def _prepare_train_index( def _prepare_train_data( self, corpus: DocumentCorpus, analyzer: 
Analyzer, n_jobs: int - ) -> Tuple[List[List[Candidate]], List[bool]]: + ) -> tuple[list[list[Candidate]], list[bool]]: # frequency of subjects (by id) in the generated candidates self._doc_freq = collections.Counter() # frequency of manually assigned subjects ("domain keyphraseness") @@ -271,8 +272,8 @@ def _prepare_train_data( return (train_x, train_y) def _calculate_idf( - self, subject_ids: List[int], doc_count: int - ) -> DefaultDict[int, float]: + self, subject_ids: list[int], doc_count: int + ) -> defaultdict[int, float]: idf = collections.defaultdict(float) for subj_id in subject_ids: idf[subj_id] = math.log((doc_count + 1) / (self._doc_freq[subj_id] + 1)) + 1 @@ -280,8 +281,8 @@ def _calculate_idf( return idf def _prepare_features( - self, train_x: List[List[Candidate]], n_jobs: int - ) -> List[np.ndarray]: + self, train_x: list[list[Candidate]], n_jobs: int + ) -> list[np.ndarray]: fc_args = {"mdata": self._model_data} jobs, pool_class = annif.parallel.get_pool(n_jobs) @@ -299,9 +300,9 @@ def prepare_train( corpus: DocumentCorpus, vocab: AnnifVocabulary, analyzer: Analyzer, - params: Dict[str, Any], + params: dict[str, Any], n_jobs: int, - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> tuple[np.ndarray, np.ndarray]: # create an index from the vocabulary terms subject_ids = self._prepare_train_index(vocab, analyzer, params) @@ -316,7 +317,7 @@ def prepare_train( return (np.vstack(features), np.array(train_y)) - def _create_classifier(self, params: Dict[str, Any]) -> BaggingClassifier: + def _create_classifier(self, params: dict[str, Any]) -> BaggingClassifier: return BaggingClassifier( DecisionTreeClassifier( min_samples_leaf=int(params["min_samples_leaf"]), @@ -327,9 +328,9 @@ def _create_classifier(self, params: Dict[str, Any]) -> BaggingClassifier: def train( self, - train_x: Union[np.ndarray, List[Tuple[int, int]]], - train_y: Union[List[bool], np.ndarray], - params: Dict[str, Any], + train_x: np.ndarray | list[tuple[int, int]], + train_y: list[bool] | np.ndarray, + params: dict[str, Any], ) -> None: # fit the model on the training corpus self._classifier = self._create_classifier(params) @@ -344,19 +345,19 @@ def train( ) def _prediction_to_list( - self, scores: np.ndarray, candidates: List[Candidate] - ) -> List[Tuple[np.float64, int]]: + self, scores: np.ndarray, candidates: list[Candidate] + ) -> list[tuple[np.float64, int]]: subj_scores = [(score[1], c.subject_id) for score, c in zip(scores, candidates)] return sorted(subj_scores, reverse=True) - def predict(self, candidates: List[Candidate]) -> List[Tuple[np.float64, int]]: + def predict(self, candidates: list[Candidate]) -> list[tuple[np.float64, int]]: if not candidates: return [] features = self._candidates_to_features(candidates) scores = self._classifier.predict_proba(features) return self._prediction_to_list(scores, candidates) - def save(self, filename: str) -> List[str]: + def save(self, filename: str) -> list[str]: return joblib.dump(self, filename) @staticmethod diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py index 570ea1204..0641a31d1 100644 --- a/annif/lexical/tokenset.py +++ b/annif/lexical/tokenset.py @@ -2,7 +2,7 @@ from __future__ import annotations import collections -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING if TYPE_CHECKING: from numpy import int32, ndarray @@ -15,8 +15,8 @@ class TokenSet: def __init__( self, - tokens: Union[List[int32], List[int], ndarray], - subject_id: Optional[int] = None, + tokens: list[int32] | list[int] 
| ndarray, + subject_id: int | None = None, is_pref: bool = False, ) -> None: self._tokens = set(tokens) @@ -51,7 +51,7 @@ def add(self, tset: TokenSet) -> None: if tset.key is not None: self._index[tset.key].add(tset) - def _find_subj_tsets(self, tset: TokenSet) -> Dict[Optional[int], TokenSet]: + def _find_subj_tsets(self, tset: TokenSet) -> dict[int | None, TokenSet]: """return a dict (subject_id : TokenSet) of matches contained in the given TokenSet""" @@ -85,7 +85,7 @@ def _find_subj_ambiguity(self, tsets): return subj_ambiguity - def search(self, tset: TokenSet) -> List[Tuple[TokenSet, int]]: + def search(self, tset: TokenSet) -> list[tuple[TokenSet, int]]: """Return the TokenSets that are contained in the given TokenSet. The matches are returned as a list of (TokenSet, ambiguity) pairs where ambiguity is an integer indicating the number of other TokenSets diff --git a/annif/lexical/util.py b/annif/lexical/util.py index 0195ce05b..28d21a141 100644 --- a/annif/lexical/util.py +++ b/annif/lexical/util.py @@ -2,7 +2,7 @@ from __future__ import annotations import collections -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING from rdflib import URIRef from rdflib.namespace import SKOS @@ -15,8 +15,8 @@ def get_subject_labels( - graph: Graph, uri: str, properties: List[URIRef], language: str -) -> List[str]: + graph: Graph, uri: str, properties: list[URIRef], language: str +) -> list[str]: return [ str(label) for prop in properties diff --git a/annif/openapi/validation.py b/annif/openapi/validation.py index 432c7fa59..7f920b35d 100644 --- a/annif/openapi/validation.py +++ b/annif/openapi/validation.py @@ -2,7 +2,6 @@ from __future__ import annotations import logging -from typing import Dict, List, Union import jsonschema from connexion import decorators @@ -21,7 +20,7 @@ def __init__(self, *args, **kwargs) -> None: def validate_schema( self, - data: Union[List, Dict], + data: list | dict, url: str, ) -> None: """Validate the request body against the schema.""" diff --git a/annif/project.py b/annif/project.py index 408c1af30..be59145b4 100644 --- a/annif/project.py +++ b/annif/project.py @@ -3,8 +3,9 @@ import enum import os.path +from collections import defaultdict from shutil import rmtree -from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Union +from typing import TYPE_CHECKING import annif import annif.analyzer @@ -62,7 +63,7 @@ class AnnifProject(DatadirMixin): def __init__( self, project_id: str, - config: Union[Dict[str, str], SectionProxy], + config: dict[str, str] | SectionProxy, datadir: str, registry: AnnifRegistry, ) -> None: @@ -133,8 +134,8 @@ def initialize(self, parallel: bool = False) -> None: def _suggest_with_backend( self, - texts: List[str], - backend_params: Optional[DefaultDict[str, Dict]], + texts: list[str], + backend_params: defaultdict[str, dict] | None, ) -> annif.suggestion.SuggestionBatch: if backend_params is None: backend_params = {} @@ -206,7 +207,7 @@ def vocab_lang(self) -> str: def subjects(self) -> SubjectIndex: return self.vocab.subjects - def _get_info(self, key: str) -> Optional[Union[bool, datetime]]: + def _get_info(self, key: str) -> bool | datetime | None: try: be = self.backend if be is not None: @@ -216,17 +217,17 @@ def _get_info(self, key: str) -> Optional[Union[bool, datetime]]: return None @property - def is_trained(self) -> Optional[bool]: + def is_trained(self) -> bool | None: return self._get_info("is_trained") @property - def modification_time(self) -> Optional[datetime]: + def 
modification_time(self) -> datetime | None: return self._get_info("modification_time") def suggest_corpus( self, corpus: DocumentCorpus, - backend_params: Optional[DefaultDict[str, Dict]] = None, + backend_params: defaultdict[str, dict] | None = None, ) -> annif.suggestion.SuggestionResults: """Suggest subjects for the given documents corpus in batches of documents.""" suggestions = ( @@ -238,8 +239,8 @@ def suggest_corpus( def suggest( self, - texts: List[str], - backend_params: Optional[DefaultDict[str, Dict]] = None, + texts: list[str], + backend_params: defaultdict[str, dict] | None = None, ) -> annif.suggestion.SuggestionBatch: """Suggest subjects for the given documents batch.""" if not self.is_trained: @@ -253,7 +254,7 @@ def suggest( def train( self, corpus: DocumentCorpus, - backend_params: Optional[DefaultDict[str, Dict]] = None, + backend_params: defaultdict[str, dict] | None = None, jobs: int = 0, ) -> None: """train the project using documents from a metadata source""" @@ -267,7 +268,7 @@ def train( def learn( self, corpus: DocumentCorpus, - backend_params: Optional[DefaultDict[str, Dict]] = None, + backend_params: defaultdict[str, dict] | None = None, ) -> None: """further train the project using documents from a metadata source""" if backend_params is None: @@ -287,7 +288,7 @@ def hyperopt( trials: int, jobs: int, metric: str, - results_file: Optional[LazyFile], + results_file: LazyFile | None, ) -> HPRecommendation: """optimize the hyperparameters of the project using a validation corpus against a given metric""" @@ -300,7 +301,7 @@ def hyperopt( project_id=self.project_id, ) - def dump(self) -> Dict[str, Optional[Union[str, Dict, bool, datetime]]]: + def dump(self) -> dict[str, str | dict | bool | datetime | None]: """return this project as a dict""" return { "project_id": self.project_id, diff --git a/annif/registry.py b/annif/registry.py index 7e631221b..513f876f0 100644 --- a/annif/registry.py +++ b/annif/registry.py @@ -2,7 +2,7 @@ from __future__ import annotations import re -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING from flask import current_app @@ -49,7 +49,7 @@ def _init_vars(self) -> None: self._projects[self._rid] = self._create_projects() self._vocabs[self._rid] = {} - def _create_projects(self) -> Dict: + def _create_projects(self) -> dict: # parse the configuration config = parse_config(self._projects_config_path) @@ -67,7 +67,7 @@ def _create_projects(self) -> Dict: def get_projects( self, min_access: Access = Access.private - ) -> Dict[str, AnnifProject]: + ) -> dict[str, AnnifProject]: """Return the available projects as a dict of project_id -> AnnifProject. The min_access parameter may be used to set the minimum access level required for the returned projects.""" @@ -91,8 +91,8 @@ def get_project( raise ValueError("No such project {}".format(project_id)) def get_vocab( - self, vocab_spec: str, default_language: Optional[str] - ) -> Union[Tuple[AnnifVocabulary, None], Tuple[AnnifVocabulary, str]]: + self, vocab_spec: str, default_language: str | None + ) -> tuple[AnnifVocabulary, None] | tuple[AnnifVocabulary, str]: """Return an (AnnifVocabulary, language) pair corresponding to the vocab_spec. 
If no language information is specified, use the given default language.""" @@ -120,7 +120,7 @@ def initialize_projects(app: LocalProxy) -> None: app.annif_registry = AnnifRegistry(projects_config_path, datadir, init_projects) -def get_projects(min_access: Access = Access.private) -> Dict[str, AnnifProject]: +def get_projects(min_access: Access = Access.private) -> dict[str, AnnifProject]: """Return the available projects as a dict of project_id -> AnnifProject. The min_access parameter may be used to set the minimum access level required for the returned projects.""" @@ -140,7 +140,7 @@ def get_project(project_id: str, min_access: Access = Access.private) -> AnnifPr raise ValueError(f"No such project '{project_id}'") -def get_vocabs(min_access: Access = Access.private) -> Dict[str, AnnifVocabulary]: +def get_vocabs(min_access: Access = Access.private) -> dict[str, AnnifVocabulary]: """Return the available vocabularies as a dict of vocab_id -> AnnifVocabulary. The min_access parameter may be used to set the minimum access level required for the returned vocabularies.""" diff --git a/annif/rest.py b/annif/rest.py index f7af6dc67..669aa690f 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -3,7 +3,7 @@ from __future__ import annotations import importlib -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any import connexion @@ -33,7 +33,7 @@ def project_not_found_error(project_id: str) -> ConnexionResponse: def server_error( - err: Union[ConfigurationException, NotSupportedException] + err: ConfigurationException | NotSupportedException, ) -> ConnexionResponse: """return a Connexion error object when there is a server error (project or backend problem)""" @@ -43,7 +43,7 @@ def server_error( ) -def show_info() -> Dict[str, str]: +def show_info() -> dict[str, str]: """return version of annif and a title for the api according to OpenAPI spec""" return {"title": "Annif REST API", "version": importlib.metadata.version("annif")} @@ -59,9 +59,7 @@ def language_not_supported_error(lang: str) -> ConnexionResponse: ) -def list_projects() -> ( - Dict[str, List[Dict[str, Optional[Union[str, Dict, bool, datetime]]]]] -): +def list_projects() -> dict[str, list[dict[str, str | dict | bool | datetime | None]]]: """return a dict with projects formatted according to OpenAPI spec""" return { @@ -74,7 +72,7 @@ def list_projects() -> ( def show_project( project_id: str, -) -> Union[Dict, ConnexionResponse]: +) -> dict | ConnexionResponse: """return a single project formatted according to OpenAPI spec""" try: @@ -86,7 +84,7 @@ def show_project( def _suggestion_to_dict( suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str -) -> Dict[str, Optional[Union[str, float]]]: +) -> dict[str, str | float | None]: subject = subject_index[suggestion.subject_id] return { "uri": subject.uri, @@ -98,19 +96,14 @@ def _suggestion_to_dict( def _hit_sets_to_list( hit_sets: SuggestionResults, subjects: SubjectIndex, lang: str -) -> List[Dict[str, List]]: +) -> list[dict[str, list]]: return [ {"results": [_suggestion_to_dict(hit, subjects, lang) for hit in hits]} for hits in hit_sets ] -def _is_error( - result: Union[ - List[Dict[str, List]], - ConnexionResponse, - ] -) -> bool: +def _is_error(result: list[dict[str, list]] | ConnexionResponse) -> bool: return ( isinstance(result, connexion.lifecycle.ConnexionResponse) and result.status_code >= 400 @@ -118,8 +111,8 @@ def _is_error( def suggest( - project_id: str, body: Dict[str, Union[float, str]] -) -> 
Union[Dict[str, List], ConnexionResponse]: + project_id: str, body: dict[str, float | str] +) -> dict[str, list] | ConnexionResponse: """suggest subjects for the given text and return a dict with results formatted according to OpenAPI spec""" @@ -136,9 +129,9 @@ def suggest( def suggest_batch( project_id: str, - body: Dict[str, List], + body: dict[str, list], **query_parameters, -) -> Union[List[Dict[str, Any]], ConnexionResponse]: +) -> list[dict[str, Any]] | ConnexionResponse: """suggest subjects for the given documents and return a list of dicts with results formatted according to OpenAPI spec""" @@ -154,9 +147,9 @@ def suggest_batch( def _suggest( project_id: str, - documents: List[Dict[str, str]], - parameters: Dict[str, Union[float, str]], -) -> Union[List[Dict[str, List]], ConnexionResponse]: + documents: list[dict[str, str]], + parameters: dict[str, float | str], +) -> list[dict[str, list]] | ConnexionResponse: corpus = _documents_to_corpus(documents, subject_index=None) try: project = annif.registry.get_project(project_id, min_access=Access.hidden) @@ -183,8 +176,8 @@ def _suggest( def _documents_to_corpus( - documents: List[Dict[str, Any]], - subject_index: Optional[SubjectIndex], + documents: list[dict[str, Any]], + subject_index: SubjectIndex | None, ) -> annif.corpus.document.DocumentList: if subject_index is not None: corpus = [ @@ -206,8 +199,8 @@ def _documents_to_corpus( def learn( project_id: str, - body: List[Dict[str, Any]], -) -> Union[ConnexionResponse, Tuple[None, int]]: + body: list[dict[str, Any]], +) -> ConnexionResponse | tuple[None, int]: """learn from documents and return an empty 204 response if succesful""" try: diff --git a/annif/suggestion.py b/annif/suggestion.py index eb80d8888..03ef8bca3 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -3,7 +3,8 @@ import collections import itertools -from typing import TYPE_CHECKING, Iterator, List, Optional +from collections.abc import Iterator +from typing import TYPE_CHECKING import numpy as np from scipy.sparse import csr_array @@ -24,7 +25,7 @@ def vector_to_suggestions(vector: np.ndarray, limit: int) -> Iterator: def filter_suggestion( preds: csr_array, - limit: Optional[int] = None, + limit: int | None = None, threshold: float = 0.0, ) -> csr_array: """filter a 2D sparse suggestion array (csr_array), retaining only the @@ -85,9 +86,9 @@ def __init__(self, array: csr_array) -> None: @classmethod def from_sequence( cls, - suggestion_results: List[List[SubjectSuggestion]], + suggestion_results: list[list[SubjectSuggestion]], subject_index: SubjectIndex, - limit: Optional[int] = None, + limit: int | None = None, ) -> SuggestionBatch: """Create a new SuggestionBatch from a sequence where each item is a sequence of SubjectSuggestion objects.""" @@ -111,7 +112,7 @@ def from_sequence( @classmethod def from_averaged( - cls, batches: List[SuggestionBatch], weights: List[float] + cls, batches: list[SuggestionBatch], weights: list[float] ) -> SuggestionBatch: """Create a new SuggestionBatch where the subject scores are the weighted average of scores in several SuggestionBatches""" @@ -122,7 +123,7 @@ def from_averaged( return SuggestionBatch(avg_array) def filter( - self, limit: Optional[int] = None, threshold: float = 0.0 + self, limit: int | None = None, threshold: float = 0.0 ) -> SuggestionBatch: """Return a subset of the hits, filtered by the given limit and score threshold, as another SuggestionBatch object.""" @@ -141,14 +142,14 @@ def __len__(self) -> int: class SuggestionResults: """Subject suggestions 
for a potentially very large number of documents.""" - def __init__(self, batches: List[SuggestionBatch]) -> None: + def __init__(self, batches: list[SuggestionBatch]) -> None: """Initialize a new SuggestionResults from an iterable that provides SuggestionBatch objects.""" self.batches = batches def filter( - self, limit: Optional[int] = None, threshold: float = 0.0 + self, limit: int | None = None, threshold: float = 0.0 ) -> SuggestionResults: """Return a view of these suggestions, filtered by the given limit and/or threshold, as another SuggestionResults object.""" diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index e88dbe75e..716d874a2 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -2,7 +2,7 @@ from __future__ import annotations import re -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING import annif from annif.exception import ConfigurationException @@ -17,7 +17,7 @@ def parse_specs( transform_specs: str, -) -> List[Tuple[str, List, Dict]]: +) -> list[tuple[str, list, dict]]: """Parse a transformation specification into a list of tuples, e.g. 'transf_1(x),transf_2(y=42),transf_3' is parsed to [(transf_1, [x], {}), (transf_2, [], {y: 42}), (transf_3, [], {})].""" @@ -35,9 +35,7 @@ def parse_specs( return parsed -def get_transform( - transform_specs: str, project: Optional[AnnifProject] -) -> TransformChain: +def get_transform(transform_specs: str, project: AnnifProject | None) -> TransformChain: transform_defs = parse_specs(transform_specs) transform_classes = [] args = [] diff --git a/annif/transform/inputlimiter.py b/annif/transform/inputlimiter.py index 14a233350..229766864 100644 --- a/annif/transform/inputlimiter.py +++ b/annif/transform/inputlimiter.py @@ -2,7 +2,7 @@ given character length.""" from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from annif.exception import ConfigurationException @@ -15,7 +15,7 @@ class InputLimiter(transform.BaseTransform): name = "limit" - def __init__(self, project: Optional[AnnifProject], input_limit: str) -> None: + def __init__(self, project: AnnifProject | None, input_limit: str) -> None: super().__init__(project) self.input_limit = int(input_limit) self._validate_value(self.input_limit) diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index 018ea3996..7508550fc 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -2,7 +2,7 @@ different from the language of the project.""" from __future__ import annotations -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING from simplemma.langdetect import in_target_language @@ -22,8 +22,8 @@ class LangFilter(transform.BaseTransform): def __init__( self, project: AnnifProject, - text_min_length: Union[int, str] = 500, - sentence_min_length: Union[int, str] = 50, + text_min_length: int | str = 500, + sentence_min_length: int | str = 50, min_ratio: float = 0.5, ) -> None: super().__init__(project) diff --git a/annif/transform/transform.py b/annif/transform/transform.py index a4ba227d7..1d185da3b 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -2,7 +2,7 @@ from __future__ import annotations import abc -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Type from annif.corpus import TransformingDocumentCorpus from annif.exception import ConfigurationException @@ -18,7 +18,7 @@ class 
BaseTransform(metaclass=abc.ABCMeta): name = None - def __init__(self, project: Optional[AnnifProject]) -> None: + def __init__(self, project: AnnifProject | None) -> None: self.project = project @abc.abstractmethod @@ -42,18 +42,18 @@ class TransformChain: def __init__( self, - transform_classes: List[Type[BaseTransform]], - args: List[Tuple[List, Dict]], - project: Optional[AnnifProject], + transform_classes: list[Type[BaseTransform]], + args: list[tuple[list, dict]], + project: AnnifProject | None, ) -> None: self.project = project self.transforms = self._init_transforms(transform_classes, args) def _init_transforms( self, - transform_classes: List[Type[BaseTransform]], - args: List[Tuple[List, Dict]], - ) -> List[Type[BaseTransform]]: + transform_classes: list[Type[BaseTransform]], + args: list[tuple[list, dict]], + ) -> list[Type[BaseTransform]]: transforms = [] for trans, (posargs, kwargs) in zip(transform_classes, args): try: diff --git a/annif/vocab.py b/annif/vocab.py index b33550920..333fa0d69 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -2,7 +2,7 @@ from __future__ import annotations import os.path -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING import annif import annif.corpus @@ -107,7 +107,7 @@ def __len__(self) -> int: return len(self.subjects) @property - def languages(self) -> List[str]: + def languages(self) -> list[str]: return self.subjects.languages def load_vocabulary( From 5f84a56998c9cde663caa420acb23e001b692ce9 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 30 May 2023 11:27:37 +0300 Subject: [PATCH 20/28] Manually annotate annif/corpus/parallel.py --- annif/parallel.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/annif/parallel.py b/annif/parallel.py index 3162a47c5..c6b293f8e 100644 --- a/annif/parallel.py +++ b/annif/parallel.py @@ -1,8 +1,19 @@ """Parallel processing functionality for Annif""" - +from __future__ import annotations import multiprocessing import multiprocessing.dummy +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections import defaultdict + from collections.abc import Iterator + from typing import Callable + + from annif.corpus import Document, SubjectSet + from annif.registry import AnnifRegistry + from annif.suggestion import SuggestionBatch, SuggestionResult + # Start method for processes created by the multiprocessing module. # A value of None means using the platform-specific default. @@ -22,7 +33,7 @@ class BaseWorker: args = None @classmethod - def init(cls, args): + def init(cls, args) -> None: cls.args = args # pragma: no cover @@ -31,14 +42,21 @@ class ProjectSuggestMap: provide a mapping method that converts Document objects to suggestions. 
Intended to be used with the multiprocessing module.""" - def __init__(self, registry, project_ids, backend_params, limit, threshold): + def __init__( + self, + registry: AnnifRegistry, + project_ids: list[str], + backend_params: defaultdict[str, Any] | None, + limit: int | None, + threshold: float, + ) -> None: self.registry = registry self.project_ids = project_ids self.backend_params = backend_params self.limit = limit self.threshold = threshold - def suggest(self, doc): + def suggest(self, doc: Document) -> tuple[dict[str, SuggestionResult], SubjectSet]: filtered_hits = {} for project_id in self.project_ids: project = self.registry.get_project(project_id) @@ -46,7 +64,9 @@ def suggest(self, doc): filtered_hits[project_id] = batch.filter(self.limit, self.threshold)[0] return (filtered_hits, doc.subject_set) - def suggest_batch(self, batch): + def suggest_batch( + self, batch + ) -> tuple[dict[str, SuggestionBatch], Iterator[SubjectSet]]: filtered_hit_sets = {} texts, subject_sets = zip(*[(doc.text, doc.subject_set) for doc in batch]) @@ -57,19 +77,19 @@ def suggest_batch(self, batch): return (filtered_hit_sets, subject_sets) -def get_pool(n_jobs): - """return a suitable multiprocessing pool class, and the correct jobs - argument for its constructor, for the given amount of parallel jobs""" +def get_pool(n_jobs: int) -> tuple[int | None, Callable]: + """return a suitable constructor for multiprocessing pool class, and the correct + jobs argument for it, for the given amount of parallel jobs""" ctx = multiprocessing.get_context(MP_START_METHOD) if n_jobs < 1: n_jobs = None - pool_class = ctx.Pool + pool_constructor: Callable = ctx.Pool elif n_jobs == 1: # use the dummy wrapper around threading to avoid subprocess overhead - pool_class = multiprocessing.dummy.Pool + pool_constructor = multiprocessing.dummy.Pool else: - pool_class = ctx.Pool + pool_constructor = ctx.Pool - return n_jobs, pool_class + return n_jobs, pool_constructor From b9cfacf8f23d3aee9e2123c3fb677fffe13d0398 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 30 May 2023 13:00:07 +0300 Subject: [PATCH 21/28] Manually annotate annif/util.py --- annif/util.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/annif/util.py b/annif/util.py index a664027f5..803aa8aea 100644 --- a/annif/util.py +++ b/annif/util.py @@ -1,10 +1,12 @@ """Utility functions for Annif""" +from __future__ import annotations import glob import logging import os import os.path import tempfile +from typing import Any, Callable from annif import logger @@ -12,11 +14,11 @@ class DuplicateFilter(logging.Filter): """Filter out log messages that have already been displayed.""" - def __init__(self): + def __init__(self) -> None: super().__init__() self.logged = set() - def filter(self, record): + def filter(self, record: logging.LogRecord) -> bool: current_log = hash((record.module, record.levelno, record.msg, record.args)) if current_log not in self.logged: self.logged.add(current_log) @@ -24,7 +26,9 @@ def filter(self, record): return False -def atomic_save(obj, dirname, filename, method=None): +def atomic_save( + obj: Any, dirname: str, filename: str, method: Callable | None = None +) -> None: """Save the given object (which must have a .save() method, unless the method parameter is given) into the given directory with the given filename, using a temporary file and renaming the temporary file to the @@ -44,14 +48,14 @@ def atomic_save(obj, dirname, filename, 
method=None): os.rename(fn, newname) -def cleanup_uri(uri): +def cleanup_uri(uri: str) -> str: """remove angle brackets from a URI, if any""" if uri.startswith("<") and uri.endswith(">"): return uri[1:-1] return uri -def parse_sources(sourcedef): +def parse_sources(sourcedef: str) -> list[tuple[str, float]]: """parse a source definition such as 'src1:1.0,src2' into a sequence of tuples (src_id, weight)""" @@ -69,7 +73,7 @@ def parse_sources(sourcedef): return [(srcid, weight / totalweight) for srcid, weight in sources] -def parse_args(param_string): +def parse_args(param_string: str) -> tuple[list, dict]: """Parse a string of comma separated arguments such as '42,43,key=abc' into a list of positional args [42, 43] and a dict of keyword args {key: abc}""" @@ -87,7 +91,7 @@ def parse_args(param_string): return posargs, kwargs -def boolean(val): +def boolean(val: Any) -> bool: """Convert the given value to a boolean True/False value, if it isn't already. True values are '1', 'yes', 'true', and 'on' (case insensitive), everything else is False.""" @@ -95,7 +99,7 @@ def boolean(val): return str(val).lower() in ("1", "yes", "true", "on") -def identity(x): +def identity(x: Any) -> Any: """Identity function: return the given argument unchanged""" return x From c49fee924687d15c38abf0428e6127e4ea8a981e Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 30 May 2023 15:50:38 +0300 Subject: [PATCH 22/28] Fix easily fixable errors noted by Mypy --- annif/backend/backend.py | 9 +++++++-- annif/backend/ensemble.py | 4 +++- annif/backend/hyperopt.py | 2 +- annif/backend/mixins.py | 4 ++-- annif/backend/mllm.py | 2 +- annif/backend/yake.py | 11 +++++------ annif/cli_util.py | 4 ++-- annif/corpus/skos.py | 2 +- annif/corpus/subject.py | 10 +++++----- annif/eval.py | 10 +++++----- annif/project.py | 2 +- annif/registry.py | 8 ++------ annif/rest.py | 7 +++---- annif/suggestion.py | 6 +++--- annif/transform/langfilter.py | 2 +- annif/transform/transform.py | 2 +- 16 files changed, 43 insertions(+), 42 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 6742493b7..f35b0a312 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -11,6 +11,8 @@ from annif.suggestion import SuggestionBatch if TYPE_CHECKING: + from configparser import SectionProxy + from annif.corpus.document import DocumentCorpus from annif.project import AnnifProject @@ -24,7 +26,10 @@ class AnnifBackend(metaclass=abc.ABCMeta): DEFAULT_PARAMETERS = {"limit": 100} def __init__( - self, backend_id: str, config_params: dict[str, Any], project: AnnifProject + self, + backend_id: str, + config_params: dict[str, Any] | SectionProxy, + project: AnnifProject, ) -> None: """Initialize backend with specific parameters. The parameters are a dict. Keys and values depend on the specific @@ -49,7 +54,7 @@ def is_trained(self) -> bool: return bool(glob(os.path.join(self.datadir, "*"))) @property - def modification_time(self) -> datetime.datetime | None: + def modification_time(self) -> datetime | None: mtimes = [ datetime.utcfromtimestamp(os.path.getmtime(p)) for p in glob(os.path.join(self.datadir, "*")) diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index 9b22d915e..97cbc73ac 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -12,6 +12,8 @@ from . 
import backend, hyperopt if TYPE_CHECKING: + from datetime import datetime + from optuna.study.study import Study from optuna.trial._trial import Trial @@ -150,7 +152,7 @@ def is_trained(self) -> bool: return all(sources_trained) @property - def modification_time(self) -> None: + def modification_time(self) -> datetime | None: mtimes = self._get_sources_attribute("modification_time") return max(filter(None, mtimes), default=None) diff --git a/annif/backend/hyperopt.py b/annif/backend/hyperopt.py index 7e1506c03..2c2e7422c 100644 --- a/annif/backend/hyperopt.py +++ b/annif/backend/hyperopt.py @@ -116,7 +116,7 @@ class AnnifHyperoptBackend(AnnifBackend): optimization""" @abc.abstractmethod - def get_hp_optimizer(self, corpus: DocumentCorpus): + def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str): """Get a HyperparameterOptimizer object that can look for optimal hyperparameter combinations for the given corpus, measured using the given metric""" diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py index 942f9cad0..2fa7d9eba 100644 --- a/annif/backend/mixins.py +++ b/annif/backend/mixins.py @@ -3,7 +3,7 @@ import abc import os.path -from collections.abc import Iterator +from collections.abc import Iterable from typing import TYPE_CHECKING, Any import joblib @@ -71,7 +71,7 @@ def initialize_vectorizer(self) -> None: ) def create_vectorizer( - self, input: Iterator[str], params: dict[str, Any] = {} + self, input: Iterable[str], params: dict[str, Any] = {} ) -> csr_matrix: self.info("creating vectorizer") self.vectorizer = TfidfVectorizer(**params) diff --git a/annif/backend/mllm.py b/annif/backend/mllm.py index 2d4337c6e..7315dcc71 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -40,7 +40,7 @@ def _prepare(self, n_jobs: int = 1) -> None: self._candidates.append(candidates) self._gold_subjects.append(doc.subject_set) - def _objective(self, trial: Trial) -> np.float: + def _objective(self, trial: Trial) -> float: params = { "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 30), "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 100, 2000), diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 5d853f4c5..1e6adfdd5 100644 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -19,7 +19,6 @@ from . 
import backend if TYPE_CHECKING: - from numpy import float64 from rdflib.term import URIRef from annif.corpus.document import DocumentCorpus @@ -141,8 +140,8 @@ def _suggest(self, text: str, params: dict[str, Any]) -> list[SubjectSuggestion] return subject_suggestions def _keyphrases2suggestions( - self, keyphrases: list[tuple[str, float64]] - ) -> list[tuple[str, float64]]: + self, keyphrases: list[tuple[str, float]] + ) -> list[tuple[str, float]]: suggestions = [] not_matched = [] for kp, score in keyphrases: @@ -169,13 +168,13 @@ def _keyphrase2uris(self, keyphrase: str) -> set[str]: keyphrase = self._sort_phrase(keyphrase) return self._index.get(keyphrase, []) - def _transform_score(self, score: float64) -> float64: + def _transform_score(self, score: float) -> float: score = max(score, 0) return 1.0 / (score + 1) def _combine_suggestions( - self, suggestions: list[tuple[str, float], tuple[str, float64]] - ) -> list[tuple[str, float], tuple[str, float64]]: + self, suggestions: list[tuple[str, float]] + ) -> list[tuple[str, float]]: combined_suggestions = {} for uri, score in suggestions: if uri not in combined_suggestions: diff --git a/annif/cli_util.py b/annif/cli_util.py index 4d636abc6..bbfa96df4 100644 --- a/annif/cli_util.py +++ b/annif/cli_util.py @@ -128,7 +128,7 @@ def format_datetime(dt: datetime | None) -> str: def open_documents( - paths: tuple[str, ...] | tuple[()], + paths: tuple[str, ...], subject_index: SubjectIndex, vocab_lang: str, docs_limit: int | None, @@ -232,7 +232,7 @@ def generate_filter_params(filter_batch_max_limit: int) -> list[tuple[int, float def _get_completion_choices( param: Argument, -) -> dict[str, AnnifVocabulary | AnnifProject]: +) -> dict[str, AnnifVocabulary] | dict[str, AnnifProject] | list: if param.name == "project_id": return annif.registry.get_projects() elif param.name == "vocab_id": diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 97df7d700..cd31cca80 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -113,7 +113,7 @@ def get_concept_labels( self, concept: URIRef, label_types: Sequence[URIRef], - ) -> defaultdict[str, list[str]] | defaultdict[None, list[str]]: + ) -> defaultdict[str | None, list[str]]: """return all the labels of the given concept with the given label properties as a dict-like object where the keys are language codes and the values are lists of labels in that language""" diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 045867990..082bf9ad4 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -3,7 +3,7 @@ import csv import os.path -from collections.abc import Iterator +from collections.abc import Generator, Iterator from typing import TYPE_CHECKING, Any import annif @@ -42,7 +42,7 @@ def languages(self) -> list[str]: return [self.language] @property - def subjects(self) -> None: + def subjects(self) -> Generator: with open(self.path, encoding="utf-8-sig") as subjfile: for line in subjfile: yield from self._parse_line(line) @@ -92,7 +92,7 @@ def languages(self) -> list[str]: ] @property - def subjects(self) -> None: + def subjects(self) -> Generator: with open(self.path, encoding="utf-8-sig") as csvfile: reader = csv.DictReader(csvfile) for row in reader: @@ -131,7 +131,7 @@ def __len__(self) -> int: return len(self._subjects) @property - def languages(self) -> list[str]: + def languages(self) -> list[str] | None: return self._languages def __getitem__(self, subject_id: int | np.int32) -> Subject: @@ -239,7 +239,7 @@ def __getitem__(self, idx: int) -> int: def 
__bool__(self) -> bool: return bool(self._subject_ids) - def __eq__(self, other: SubjectSet) -> bool: + def __eq__(self, other: Any) -> bool: if isinstance(other, SubjectSet): return self._subject_ids == other._subject_ids diff --git a/annif/eval.py b/annif/eval.py index d20b4b67b..4f788bed1 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from collections.abc import Iterator, Sequence +from collections.abc import Iterable, Iterator, Sequence from typing import TYPE_CHECKING import numpy as np @@ -110,8 +110,8 @@ def _evaluate_samples( self, y_true: csr_array, y_pred: csr_array, - metrics: Sequence[str] = [], - ) -> dict[str, np.float64 | float | int]: + metrics: Iterable[str] = [], + ) -> dict[str, float]: y_pred_binary = y_pred > 0.0 # define the available metrics as lazy lambda functions @@ -245,10 +245,10 @@ def output_result_per_subject( def results( self, - metrics: Sequence[str] = [], + metrics: Iterable[str] = [], results_file: LazyFile | TextIOWrapper | None = None, language: str | None = None, - ) -> dict[str, np.float64 | float]: + ) -> dict[str, float]: """evaluate a set of selected subjects against a gold standard using different metrics. If metrics is empty, use all available metrics. If results_file (file object) given, write results per subject to it diff --git a/annif/project.py b/annif/project.py index be59145b4..5dbebf9d4 100644 --- a/annif/project.py +++ b/annif/project.py @@ -162,7 +162,7 @@ def transform(self) -> TransformChain: return self._transform @property - def backend(self) -> AnnifBackend: + def backend(self) -> AnnifBackend | None: if self._backend is None: if "backend" not in self.config: raise ConfigurationException( diff --git a/annif/registry.py b/annif/registry.py index 513f876f0..81bd541ef 100644 --- a/annif/registry.py +++ b/annif/registry.py @@ -2,9 +2,8 @@ from __future__ import annotations import re -from typing import TYPE_CHECKING -from flask import current_app +from flask import Flask, current_app import annif from annif.config import parse_config @@ -13,9 +12,6 @@ from annif.util import parse_args from annif.vocab import AnnifVocabulary -if TYPE_CHECKING: - from werkzeug.local import LocalProxy - logger = annif.logger @@ -113,7 +109,7 @@ def get_vocab( return self._vocabs[self._rid][vocab_key], language -def initialize_projects(app: LocalProxy) -> None: +def initialize_projects(app: Flask) -> None: projects_config_path = app.config["PROJECTS_CONFIG_PATH"] datadir = app.config["DATADIR"] init_projects = app.config["INITIALIZE_PROJECTS"] diff --git a/annif/rest.py b/annif/rest.py index 669aa690f..f848117c8 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -18,7 +18,6 @@ from connexion.lifecycle import ConnexionResponse from annif.corpus.subject import SubjectIndex - from annif.exception import ConfigurationException, NotSupportedException from annif.suggestion import SubjectSuggestion, SuggestionResults @@ -33,7 +32,7 @@ def project_not_found_error(project_id: str) -> ConnexionResponse: def server_error( - err: ConfigurationException | NotSupportedException, + err: AnnifException, ) -> ConnexionResponse: """return a Connexion error object when there is a server error (project or backend problem)""" @@ -111,7 +110,7 @@ def _is_error(result: list[dict[str, list]] | ConnexionResponse) -> bool: def suggest( - project_id: str, body: dict[str, float | str] + project_id: str, body: dict[str, Any] ) -> dict[str, list] | ConnexionResponse: """suggest subjects for the given text and return a 
dict with results formatted according to OpenAPI spec""" @@ -148,7 +147,7 @@ def suggest_batch( def _suggest( project_id: str, documents: list[dict[str, str]], - parameters: dict[str, float | str], + parameters: dict[str, Any], ) -> list[dict[str, list]] | ConnexionResponse: corpus = _documents_to_corpus(documents, subject_index=None) try: diff --git a/annif/suggestion.py b/annif/suggestion.py index 03ef8bca3..187ba5f7e 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -3,7 +3,7 @@ import collections import itertools -from collections.abc import Iterator +from collections.abc import Iterable, Iterator, Sequence from typing import TYPE_CHECKING import numpy as np @@ -86,7 +86,7 @@ def __init__(self, array: csr_array) -> None: @classmethod def from_sequence( cls, - suggestion_results: list[list[SubjectSuggestion]], + suggestion_results: Sequence[Iterable[SubjectSuggestion]], subject_index: SubjectIndex, limit: int | None = None, ) -> SuggestionBatch: @@ -142,7 +142,7 @@ def __len__(self) -> int: class SuggestionResults: """Subject suggestions for a potentially very large number of documents.""" - def __init__(self, batches: list[SuggestionBatch]) -> None: + def __init__(self, batches: Iterable[SuggestionBatch]) -> None: """Initialize a new SuggestionResults from an iterable that provides SuggestionBatch objects.""" diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index 7508550fc..6794eb370 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -40,7 +40,7 @@ def transform_fn(self, text: str) -> str: if len(sent) < self.sentence_min_length: retained_sentences.append(sent) continue - proportion = in_target_language(sent, lang=self.project.language) + proportion = in_target_language(sent, lang=(self.project.language,)) if proportion >= self.min_ratio: retained_sentences.append(sent) return " ".join(retained_sentences) diff --git a/annif/transform/transform.py b/annif/transform/transform.py index 1d185da3b..db71fef37 100644 --- a/annif/transform/transform.py +++ b/annif/transform/transform.py @@ -53,7 +53,7 @@ def _init_transforms( self, transform_classes: list[Type[BaseTransform]], args: list[tuple[list, dict]], - ) -> list[Type[BaseTransform]]: + ) -> list[BaseTransform]: transforms = [] for trans, (posargs, kwargs) in zip(transform_classes, args): try: From 8092605c4237d4f3147af027723b1067afd19ea6 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 1 Jun 2023 16:44:01 +0300 Subject: [PATCH 23/28] Exclude TYPE_CHECKING blocks from test coverage --- setup.cfg | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index cffe59417..bf3f116d6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ current_version = 1.0.0-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? 
-serialize = +serialize = {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} @@ -13,7 +13,7 @@ serialize = [bumpversion:part:release] optional_value = prod -values = +values = dev prod @@ -23,3 +23,7 @@ test = pytest [flake8] max-line-length = 88 ignore = E203 W503 + +[coverage:report] +exclude_also = + if TYPE_CHECKING: From 7ec0b7387d566c5a0802eb0954467cbbe502e966 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 1 Jun 2023 17:36:50 +0300 Subject: [PATCH 24/28] Narrow down TokenSet tokens type to np.ndarray only --- annif/lexical/tokenset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/lexical/tokenset.py b/annif/lexical/tokenset.py index 0641a31d1..07c15705d 100644 --- a/annif/lexical/tokenset.py +++ b/annif/lexical/tokenset.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from numpy import int32, ndarray + from numpy import ndarray class TokenSet: @@ -15,7 +15,7 @@ class TokenSet: def __init__( self, - tokens: list[int32] | list[int] | ndarray, + tokens: ndarray, subject_id: int | None = None, is_pref: bool = False, ) -> None: From fd19c67845b8b778c150adc432495f2ca2f94252 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 1 Jun 2023 17:38:46 +0300 Subject: [PATCH 25/28] Use int instead of int | np.int32 --- annif/corpus/subject.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 082bf9ad4..f507fc6af 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -134,7 +134,7 @@ def __len__(self) -> int: def languages(self) -> list[str] | None: return self._languages - def __getitem__(self, subject_id: int | np.int32) -> Subject: + def __getitem__(self, subject_id: int) -> Subject: return self._subjects[subject_id] def append(self, subject: Subject) -> None: From 486b48f31bd6e38e9d7059d5d4a2378e7a876ee8 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 1 Jun 2023 17:40:10 +0300 Subject: [PATCH 26/28] Move imports for type typechecking only to TYPE_CHECKING blocks --- annif/backend/ensemble.py | 2 +- annif/backend/mixins.py | 3 ++- annif/backend/mllm.py | 5 +++-- annif/backend/tfidf.py | 3 ++- annif/corpus/document.py | 3 ++- annif/corpus/skos.py | 6 +++--- annif/corpus/subject.py | 3 ++- annif/corpus/types.py | 3 +-- annif/eval.py | 2 +- annif/lexical/mllm.py | 3 ++- annif/project.py | 2 +- annif/suggestion.py | 3 ++- 12 files changed, 22 insertions(+), 16 deletions(-) diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py index 97cbc73ac..6f7f2eb04 100644 --- a/annif/backend/ensemble.py +++ b/annif/backend/ensemble.py @@ -15,7 +15,7 @@ from datetime import datetime from optuna.study.study import Study - from optuna.trial._trial import Trial + from optuna.trial import Trial from annif.backend.hyperopt import HPRecommendation from annif.corpus.document import DocumentCorpus diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py index 2fa7d9eba..066d5d862 100644 --- a/annif/backend/mixins.py +++ b/annif/backend/mixins.py @@ -3,7 +3,6 @@ import abc import os.path -from collections.abc import Iterable from typing import TYPE_CHECKING, Any import joblib @@ -13,6 +12,8 @@ from annif.exception import NotInitializedException if TYPE_CHECKING: + from collections.abc import Iterable + from scipy.sparse._csr import csr_matrix from annif.suggestion import SubjectSuggestion diff --git 
a/annif/backend/mllm.py b/annif/backend/mllm.py index 7315dcc71..f73bf8324 100644 --- a/annif/backend/mllm.py +++ b/annif/backend/mllm.py @@ -2,7 +2,6 @@ from __future__ import annotations import os.path -from collections.abc import Iterator from typing import TYPE_CHECKING, Any import joblib @@ -17,8 +16,10 @@ from . import backend, hyperopt if TYPE_CHECKING: + from collections.abc import Iterator + from optuna.study.study import Study - from optuna.trial._trial import Trial + from optuna.trial import Trial from annif.backend.hyperopt import HPRecommendation from annif.corpus.document import DocumentCorpus diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py index bf2f6c40d..1cca639ca 100644 --- a/annif/backend/tfidf.py +++ b/annif/backend/tfidf.py @@ -4,7 +4,6 @@ import os.path import tempfile -from collections.abc import Iterator from typing import TYPE_CHECKING, Any import gensim.similarities @@ -17,6 +16,8 @@ from . import backend, mixins if TYPE_CHECKING: + from collections.abc import Iterator + from scipy.sparse._csr import csr_matrix from annif.corpus.document import DocumentCorpus diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 78ea838d8..09a80a309 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -5,7 +5,6 @@ import gzip import os.path import re -from collections.abc import Iterator from itertools import islice from typing import TYPE_CHECKING @@ -15,6 +14,8 @@ from .types import Document, DocumentCorpus if TYPE_CHECKING: + from collections.abc import Iterator + from annif.corpus.subject import SubjectIndex logger = annif.logger diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index cd31cca80..462a35241 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -4,8 +4,6 @@ import collections import os.path import shutil -from collections import defaultdict -from collections.abc import Iterator, Sequence from typing import TYPE_CHECKING import rdflib @@ -17,6 +15,8 @@ from .types import Subject, SubjectCorpus if TYPE_CHECKING: + from collections.abc import Iterator, Sequence + from rdflib.term import URIRef @@ -113,7 +113,7 @@ def get_concept_labels( self, concept: URIRef, label_types: Sequence[URIRef], - ) -> defaultdict[str | None, list[str]]: + ) -> collections.defaultdict[str | None, list[str]]: """return all the labels of the given concept with the given label properties as a dict-like object where the keys are language codes and the values are lists of labels in that language""" diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index f507fc6af..a9ee06397 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -3,7 +3,6 @@ import csv import os.path -from collections.abc import Generator, Iterator from typing import TYPE_CHECKING, Any import annif @@ -13,6 +12,8 @@ from .types import Subject, SubjectCorpus if TYPE_CHECKING: + from collections.abc import Generator, Iterator + import numpy as np logger = annif.logger.getChild("subject") diff --git a/annif/corpus/types.py b/annif/corpus/types.py index de3c20db9..e6cd4b252 100644 --- a/annif/corpus/types.py +++ b/annif/corpus/types.py @@ -3,7 +3,6 @@ import abc import collections -from collections.abc import Iterator from itertools import islice Document = collections.namedtuple("Document", "text subject_set") @@ -21,7 +20,7 @@ def documents(self): pass # pragma: no cover @property - def doc_batches(self) -> Iterator[list[Document]]: + def doc_batches(self) -> collections.abc.Iterator[list[Document]]: """Iterate through the document 
corpus in batches, yielding lists of Document objects.""" it = iter(self.documents) diff --git a/annif/eval.py b/annif/eval.py index 4f788bed1..5ec5bd17a 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -2,7 +2,6 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Iterator, Sequence from typing import TYPE_CHECKING import numpy as np @@ -13,6 +12,7 @@ from annif.suggestion import SuggestionBatch, filter_suggestion if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Sequence from io import TextIOWrapper from click.utils import LazyFile diff --git a/annif/lexical/mllm.py b/annif/lexical/mllm.py index 86f87e698..37564a76d 100644 --- a/annif/lexical/mllm.py +++ b/annif/lexical/mllm.py @@ -3,7 +3,6 @@ import collections import math -from collections import defaultdict from enum import IntEnum from statistics import mean from typing import TYPE_CHECKING, Any @@ -26,6 +25,8 @@ ) if TYPE_CHECKING: + from collections import defaultdict + from rdflib.graph import Graph from rdflib.term import URIRef diff --git a/annif/project.py b/annif/project.py index 5dbebf9d4..cb2b069bc 100644 --- a/annif/project.py +++ b/annif/project.py @@ -3,7 +3,6 @@ import enum import os.path -from collections import defaultdict from shutil import rmtree from typing import TYPE_CHECKING @@ -21,6 +20,7 @@ ) if TYPE_CHECKING: + from collections import defaultdict from configparser import SectionProxy from datetime import datetime diff --git a/annif/suggestion.py b/annif/suggestion.py index 187ba5f7e..ddf3ec2e5 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -3,13 +3,14 @@ import collections import itertools -from collections.abc import Iterable, Iterator, Sequence from typing import TYPE_CHECKING import numpy as np from scipy.sparse import csr_array if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Sequence + from annif.corpus.subject import SubjectIndex SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score") From 6e921238bbf3ae725e9f4f15fca45eaa92817475 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:39:27 +0300 Subject: [PATCH 27/28] Restore accidentally removed annif.suggestion import --- annif/project.py | 1 + 1 file changed, 1 insertion(+) diff --git a/annif/project.py b/annif/project.py index cb2b069bc..83f7eda7c 100644 --- a/annif/project.py +++ b/annif/project.py @@ -234,6 +234,7 @@ def suggest_corpus( self.suggest([doc.text for doc in doc_batch], backend_params) for doc_batch in corpus.doc_batches ) + import annif.suggestion return annif.suggestion.SuggestionResults(suggestions) From 252c75f1fe72a389e73593c78a22bc8c684b2de0 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Fri, 2 Jun 2023 14:46:01 +0300 Subject: [PATCH 28/28] Make type optional as it should be --- annif/__init__.py | 1 - annif/backend/http.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/annif/__init__.py b/annif/__init__.py index dc353634b..bb196b4ee 100644 --- a/annif/__init__.py +++ b/annif/__init__.py @@ -11,7 +11,6 @@ logger = logging.getLogger("annif") logger.setLevel(level=logging.INFO) - import annif.backend # noqa if TYPE_CHECKING: diff --git a/annif/backend/http.py b/annif/backend/http.py index f57511f64..0fce7f8e4 100644 --- a/annif/backend/http.py +++ b/annif/backend/http.py @@ -32,7 +32,7 @@ def headers(self) -> dict[str, str]: return self._headers @property - def is_trained(self) -> 
bool:
+    def is_trained(self) -> bool | None:
         return self._get_project_info("is_trained")
 
     @property
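
The pattern applied throughout this patch series — postponed evaluation of annotations via "from __future__ import annotations", PEP 604 unions and builtin generics, typing-only imports guarded by TYPE_CHECKING, and those TYPE_CHECKING blocks excluded from test coverage — can be summarised with a minimal, hypothetical module (an illustrative sketch, not Annif code):

# Hypothetical example (not part of Annif) showing the annotation style
# introduced by this series: builtin generics, PEP 604 unions, and imports
# that are needed only by the type checker kept under TYPE_CHECKING.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # excluded from coverage reporting by the [coverage:report] exclude_also rule
    from collections.abc import Iterable


def word_lengths(words: Iterable[str], limit: int | None = None) -> list[int]:
    """Return the lengths of the given words, optionally capped to limit items."""
    lengths = [len(word) for word in words]
    return lengths if limit is None else lengths[:limit]

Because the annotations are not evaluated at runtime, the Iterable import is only needed when a type checker such as Mypy inspects the module, which is why moving such imports under TYPE_CHECKING does not change runtime behaviour.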