From 94d29dbd8d30c2af958db43bb01a866832b88c3e Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 12 Nov 2024 21:02:30 +0200 Subject: [PATCH 1/6] first implementation of EstNLTK analyzer support --- annif/analyzer/__init__.py | 3 ++- annif/analyzer/estnltk.py | 28 +++++++++++++++++++ pyproject.toml | 2 ++ tests/test_analyzer_estnltk.py | 49 ++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 annif/analyzer/estnltk.py create mode 100644 tests/test_analyzer_estnltk.py diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py index 81f52511..fcd57baf 100644 --- a/annif/analyzer/__init__.py +++ b/annif/analyzer/__init__.py @@ -8,7 +8,7 @@ import annif from annif.util import parse_args -from . import simple, simplemma, snowball, spacy, voikko +from . import estnltk, simple, simplemma, snowball, spacy, voikko if TYPE_CHECKING: from annif.analyzer.analyzer import Analyzer @@ -42,3 +42,4 @@ def get_analyzer(analyzerspec: str) -> Analyzer: register_analyzer(simplemma.SimplemmaAnalyzer) register_analyzer(voikko.VoikkoAnalyzer) register_analyzer(spacy.SpacyAnalyzer) +register_analyzer(estnltk.EstNLTKAnalyzer) diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py new file mode 100644 index 00000000..1836af70 --- /dev/null +++ b/annif/analyzer/estnltk.py @@ -0,0 +1,28 @@ +"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization""" + +from __future__ import annotations + +import annif.util +from annif.exception import OperationFailedException + +from . import analyzer + + +class EstNLTKAnalyzer(analyzer.Analyzer): + name = "estnltk" + + def __init__(self, param: str, **kwargs) -> None: + self.param = param + super().__init__(**kwargs) + + def tokenize_words(self, text: str, filter: bool = True) -> list[str]: + import estnltk + + txt = estnltk.Text(text.strip()) + txt.tag_layer() + lemmas = [ + lemma + for lemma in [l[0] for l in txt.lemma] + if (not filter or self.is_valid_token(lemma)) + ] + return lemmas diff --git a/pyproject.toml b/pyproject.toml index de8410e6..924ae9ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1" fasttext-wheel = { version = "0.9.2", optional = true } voikko = { version = "0.5.*", optional = true } +estnltk = { version = "1.7.3", optional = true } tensorflow-cpu = { version = "~2.17.0", optional = true } lmdb = { version = "~1.5.1", optional = true } omikuji = { version = "0.5.*", optional = true } @@ -73,6 +74,7 @@ schemathesis = "3.*.*" [tool.poetry.extras] fasttext = ["fasttext-wheel"] voikko = ["voikko"] +estnltk = ["estnltk"] nn = ["tensorflow-cpu", "lmdb"] omikuji = ["omikuji"] yake = ["yake"] diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py new file mode 100644 index 00000000..61168808 --- /dev/null +++ b/tests/test_analyzer_estnltk.py @@ -0,0 +1,49 @@ +"""Unit tests for EstNLTK analyzer in Annif""" + +import pytest + +import annif.analyzer + +estnltk = pytest.importorskip("annif.analyzer.estnltk") + + +def test_estnltk_tokenize_words(): + analyzer = annif.analyzer.get_analyzer("estnltk") + words = analyzer.tokenize_words( + """ + Aga kõik juhtus iseenesest. Ka köögis oli kõik endine. + """ + ) + assert words == [ + "aga", + "kõik", + "juhtuma", + "iseenesest", + "köök", + "olema", + "kõik", + "endine", + ] + + +def test_estnltk_tokenize_words_no_filter(): + analyzer = annif.analyzer.get_analyzer("estnltk") + words = analyzer.tokenize_words( + """ + Aga kõik juhtus iseenesest. Ka köögis oli kõik endine. + """, + filter=False, + ) + assert words == [ + "aga", + "kõik", + "juhtuma", + "iseenesest", + ".", + "ka", + "köök", + "olema", + "kõik", + "endine", + ".", + ] From 51e841b35053eb6222212d2fc9c769ef0ea08970 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 12 Nov 2024 21:05:59 +0200 Subject: [PATCH 2/6] add estnltk dependency to CI/CD tests for Python 3.10 --- .github/workflows/cicd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 8172bcc2..e044ec95 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -85,7 +85,7 @@ jobs: fi # For Python 3.10: if [[ ${{ matrix.python-version }} == '3.10' ]]; then - poetry install -E "fasttext spacy"; + poetry install -E "fasttext spacy estnltk"; # download the small English pretrained spaCy model needed by spacy analyzer poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed fi From f9863dca9c428b0d524d37f4b0b508e35b5216dc Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 12 Nov 2024 21:13:22 +0200 Subject: [PATCH 3/6] remove unused imports --- annif/analyzer/estnltk.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py index 1836af70..c320d170 100644 --- a/annif/analyzer/estnltk.py +++ b/annif/analyzer/estnltk.py @@ -2,9 +2,6 @@ from __future__ import annotations -import annif.util -from annif.exception import OperationFailedException - from . import analyzer From 35b8955b821bf7b5a6bf416089cac4e7016d9fba Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 12 Nov 2024 21:13:40 +0200 Subject: [PATCH 4/6] fix test for estnltk install --- tests/test_analyzer_estnltk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py index 61168808..5449efcf 100644 --- a/tests/test_analyzer_estnltk.py +++ b/tests/test_analyzer_estnltk.py @@ -4,7 +4,7 @@ import annif.analyzer -estnltk = pytest.importorskip("annif.analyzer.estnltk") +estnltk = pytest.importorskip("estnltk") def test_estnltk_tokenize_words(): From 66f577d5983e9fe8c7043d410d9186b6e4a3afef Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 12 Nov 2024 21:14:41 +0200 Subject: [PATCH 5/6] refactor code to avoid flake8 warning --- annif/analyzer/estnltk.py | 12 +++++++++--- tests/test_analyzer_estnltk.py | 6 +++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py index c320d170..9c2f38be 100644 --- a/annif/analyzer/estnltk.py +++ b/annif/analyzer/estnltk.py @@ -2,12 +2,19 @@ from __future__ import annotations +import importlib + from . import analyzer class EstNLTKAnalyzer(analyzer.Analyzer): name = "estnltk" + @staticmethod + def is_available() -> bool: + # return True iff EstNLTK is installed + return importlib.util.find_spec("estnltk") is not None + def __init__(self, param: str, **kwargs) -> None: self.param = param super().__init__(**kwargs) @@ -17,9 +24,8 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]: txt = estnltk.Text(text.strip()) txt.tag_layer() - lemmas = [ + return [ lemma - for lemma in [l[0] for l in txt.lemma] + for lemma in [lemmas[0] for lemmas in txt.lemma] if (not filter or self.is_valid_token(lemma)) ] - return lemmas diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py index 5449efcf..3892b422 100644 --- a/tests/test_analyzer_estnltk.py +++ b/tests/test_analyzer_estnltk.py @@ -3,8 +3,12 @@ import pytest import annif.analyzer +import annif.analyzer.estnltk -estnltk = pytest.importorskip("estnltk") +pytestmark = pytest.mark.skipif( + not annif.analyzer.estnltk.EstNLTKAnalyzer.is_available(), + reason="EstNLTK is required", +) def test_estnltk_tokenize_words(): From 407a3185e12e9de7bbc84904af7b11ffafccd374 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 25 Nov 2024 13:06:11 +0200 Subject: [PATCH 6/6] clarify(?) licensing situation w.r.t. YAKE and EstNLTK --- README.md | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 54ffd462..1f6ef9c3 100644 --- a/README.md +++ b/README.md @@ -231,13 +231,24 @@ https://doi.org/10.18352/lq.10285 # License -The code in this repository is licensed under Apache License 2.0, except for the -dependencies included under `annif/static/css` and `annif/static/js`, -which have their own licenses, see the file headers for details. -Please note that the [YAKE](https://github.com/LIAAD/yake) library is licended -under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt), while Annif is -licensed under the Apache License 2.0. The licenses are compatible, but -depending on legal interpretation, the terms of the GPLv3 (for example the -requirement to publish corresponding source code when publishing an executable -application) may be considered to apply to the whole of Annif+Yake if you -decide to install the optional Yake dependency. +The code in this repository is licensed under Apache License 2.0, except for +the dependencies included under `annif/static/css` and `annif/static/js`, +which have their own licenses; see the file headers for details. + +Please note that the [YAKE](https://github.com/LIAAD/yake) library is +licensed under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt) and the +[EstNLTK-core](https://github.com/estnltk/estnltk/tree/main/estnltk_core) +library is licensed under +[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html), while Annif +itself is licensed under the Apache License 2.0. It is commonly accepted +that the GPLv3 and Apache 2.0 licenses are compatible at least in one +direction (GPLv3 is more restrictive than the Apache License), while the +compatibility between GPLv2 and Apache 2.0 licenses is a more difficult +question with arguments made both for and against license compatibility; +obviously it also depends on the legal environment. The Annif developers +make no legal claims; we simply provide the software and allow the user to +install these optional extensions if they consider it appropriate. Depending +on legal interpretation, the terms of the GPL (for example the requirement +to publish corresponding source code when publishing an executable +application) may be considered to apply to the whole of Annif+extensions if +you decide to install the optional Yake and/or EstNLTK dependencies.