Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EstNLTK analyzer #818

Merged
merged 6 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ jobs:
fi
# For Python 3.10:
if [[ ${{ matrix.python-version }} == '3.10' ]]; then
poetry install -E "fasttext spacy";
poetry install -E "fasttext spacy estnltk";
# download the small English pretrained spaCy model needed by spacy analyzer
poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
fi
Expand Down
31 changes: 21 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,13 +231,24 @@ https://doi.org/10.18352/lq.10285

# License

The code in this repository is licensed under Apache License 2.0, except for the
dependencies included under `annif/static/css` and `annif/static/js`,
which have their own licenses, see the file headers for details.
Please note that the [YAKE](https://github.com/LIAAD/yake) library is licensed
under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt), while Annif is
licensed under the Apache License 2.0. The licenses are compatible, but
depending on legal interpretation, the terms of the GPLv3 (for example the
requirement to publish corresponding source code when publishing an executable
application) may be considered to apply to the whole of Annif+Yake if you
decide to install the optional Yake dependency.
The code in this repository is licensed under Apache License 2.0, except for
the dependencies included under `annif/static/css` and `annif/static/js`,
which have their own licenses; see the file headers for details.

Please note that the [YAKE](https://github.com/LIAAD/yake) library is
licensed under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt) and the
[EstNLTK-core](https://github.com/estnltk/estnltk/tree/main/estnltk_core)
library is licensed under
[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html), while Annif
itself is licensed under the Apache License 2.0. It is commonly accepted
that the GPLv3 and Apache 2.0 licenses are compatible at least in one
direction (GPLv3 is more restrictive than the Apache License), while the
compatibility between GPLv2 and Apache 2.0 licenses is a more difficult
question with arguments made both for and against license compatibility;
obviously it also depends on the legal environment. The Annif developers
make no legal claims; we simply provide the software and allow the user to
install these optional extensions if they consider it appropriate. Depending
on legal interpretation, the terms of the GPL (for example the requirement
to publish corresponding source code when publishing an executable
application) may be considered to apply to the whole of Annif+extensions if
you decide to install the optional Yake and/or EstNLTK dependencies.
3 changes: 2 additions & 1 deletion annif/analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import annif
from annif.util import parse_args

from . import simple, simplemma, snowball, spacy, voikko
from . import estnltk, simple, simplemma, snowball, spacy, voikko

if TYPE_CHECKING:
from annif.analyzer.analyzer import Analyzer
Expand Down Expand Up @@ -42,3 +42,4 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
register_analyzer(simplemma.SimplemmaAnalyzer)
register_analyzer(voikko.VoikkoAnalyzer)
register_analyzer(spacy.SpacyAnalyzer)
register_analyzer(estnltk.EstNLTKAnalyzer)
31 changes: 31 additions & 0 deletions annif/analyzer/estnltk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""

from __future__ import annotations

import importlib

from . import analyzer


class EstNLTKAnalyzer(analyzer.Analyzer):
    """Analyzer that uses the EstNLTK library to lemmatize Estonian text."""

    name = "estnltk"

    @staticmethod
    def is_available() -> bool:
        """Return True iff the optional EstNLTK package is installed."""
        # "import importlib" alone does not guarantee that the util submodule
        # is bound as an attribute of the importlib package; import it
        # explicitly so find_spec is reliably available.
        import importlib.util

        return importlib.util.find_spec("estnltk") is not None

    def __init__(self, param: str, **kwargs) -> None:
        """Create the analyzer.

        param: analyzer parameter string from the analyzer spec (stored
        as-is; not otherwise interpreted here).
        """
        self.param = param
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Tokenize text into a list of lemmas.

        When filter is True, only tokens accepted by is_valid_token are
        returned (e.g. punctuation is dropped).
        """
        # Imported lazily so the module can be loaded even when the
        # optional EstNLTK dependency is not installed.
        import estnltk

        txt = estnltk.Text(text.strip())
        txt.tag_layer()
        # txt.lemma yields, per word, a list of candidate lemmas for
        # ambiguous analyses; take the first candidate for each word.
        return [
            lemma
            for lemma in [lemmas[0] for lemmas in txt.lemma]
            if (not filter or self.is_valid_token(lemma))
        ]
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"

fasttext-wheel = { version = "0.9.2", optional = true }
voikko = { version = "0.5.*", optional = true }
estnltk = { version = "1.7.3", optional = true }
tensorflow-cpu = { version = "~2.17.0", optional = true }
lmdb = { version = "~1.5.1", optional = true }
omikuji = { version = "0.5.*", optional = true }
Expand All @@ -73,6 +74,7 @@ schemathesis = "3.*.*"
[tool.poetry.extras]
fasttext = ["fasttext-wheel"]
voikko = ["voikko"]
estnltk = ["estnltk"]
nn = ["tensorflow-cpu", "lmdb"]
omikuji = ["omikuji"]
yake = ["yake"]
Expand Down
53 changes: 53 additions & 0 deletions tests/test_analyzer_estnltk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Unit tests for EstNLTK analyzer in Annif"""

import pytest

import annif.analyzer
import annif.analyzer.estnltk

# Skip every test in this module when the optional EstNLTK dependency
# is not installed in the current environment.
pytestmark = pytest.mark.skipif(
    not annif.analyzer.estnltk.EstNLTKAnalyzer.is_available(),
    reason="EstNLTK is required",
)


def test_estnltk_tokenize_words():
    """Default tokenization returns filtered lemmas for Estonian text."""
    est_analyzer = annif.analyzer.get_analyzer("estnltk")
    expected = [
        "aga",
        "kõik",
        "juhtuma",
        "iseenesest",
        "köök",
        "olema",
        "kõik",
        "endine",
    ]
    lemmas = est_analyzer.tokenize_words(
        """
        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
        """
    )
    assert lemmas == expected


def test_estnltk_tokenize_words_no_filter():
    """With filter=False, punctuation tokens are kept in the output."""
    est_analyzer = annif.analyzer.get_analyzer("estnltk")
    expected = [
        "aga",
        "kõik",
        "juhtuma",
        "iseenesest",
        ".",
        "ka",
        "köök",
        "olema",
        "kõik",
        "endine",
        ".",
    ]
    lemmas = est_analyzer.tokenize_words(
        """
        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
        """,
        filter=False,
    )
    assert lemmas == expected