Merge pull request #706 from PyThaiNLP/add-dependency-parser

wannaphong · web-flow · commit e2a340474c56 · 2022-09-17T16:02:58.000+07:00
Add pythainlp.parse.dependency_parsing
diff --git a/docker_requirements.txt b/docker_requirements.txt
@@ -12,7 +12,7 @@ sentencepiece==0.1.91
 ssg==0.0.8
 torch==1.8.1
 fastai==1.0.61
-transformers==4.8.2
+transformers==4.22.1
 phunspell==0.1.6
 spylls==0.1.5
 symspellpy==6.7.6
@@ -31,3 +31,6 @@ thai-nner==0.3
 spacy==2.3.*
 wunsen==0.0.3
 khanaa==0.0.6
+spacy_thai==0.7.1
+esupar==1.3.8
+ufal.chu-liu-edmonds==1.0.2
diff --git a/docs/api/parse.rst b/docs/api/parse.rst
@@ -0,0 +1,10 @@
+.. currentmodule:: pythainlp.parse
+
+pythainlp.parse
+===============
+The :mod:`pythainlp.parse` module provides dependency parsing for Thai.
+
+Modules
+-------
+
+.. autofunction:: dependency_parsing
diff --git a/docs/notes/installation.rst b/docs/notes/installation.rst
@@ -31,7 +31,11 @@ where ``extras`` can be
   - ``tltk`` (to support tltk)
   - ``textaugment`` (to support text augmentation)
   - ``oskut`` (to support OSKUT)
-  - ``nlpo3`` (to support nlpo3 enging)
+  - ``nlpo3`` (to support nlpo3 engine)
+  - ``spacy_thai`` (to support spacy_thai engine)
+  - ``esupar`` (to support esupar engine)
+  - ``transformers_ud`` (to support transformers_ud engine)
+  - ``dependency_parsing`` (to support dependency parsing with all engines)
   - ``full`` (install everything)
 
 For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
diff --git a/pythainlp/parse/__init__.py b/pythainlp/parse/__init__.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+"""
+PyThaiNLP Parse: public entry point that re-exports dependency_parsing.
+"""
+from pythainlp.parse.core import dependency_parsing
+
+__all__ = ["dependency_parsing"]
diff --git a/pythainlp/parse/core.py b/pythainlp/parse/core.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+# Parser built by the most recent dependency_parsing() call, cached together
+# with the engine name and model it was built for.
+_tagger = None
+_tagger_name = ""
+_tagger_model = None
+
+
+def dependency_parsing(text: str, model: str=None, engine: str="esupar")->str:
+    """
+    Dependency Parsing
+
+    The parser is built on first use and reused by later calls that
+    request the same engine and model.
+
+    :param str text: text to do dependency parsing
+    :param str model: model for using with engine \
+        (for esupar and transformers_ud)
+    :param str engine: the name of the dependency parser engine
+    :return: str (conllu)
+    :raises NotImplementedError: if engine is not one of the options below
+
+    **Options for engine**
+        * *esupar* (default) - Tokenizer POS-tagger and Dependency-parser \
+            with BERT/RoBERTa/DeBERTa model. `GitHub \
+                <https://github.com/KoichiYasuoka/esupar>`_
+        * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
+            for Thai language, working on Universal Dependencies. \
+            `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_
+        * *transformers_ud* - TransformersUD \
+            `GitHub <https://github.com/KoichiYasuoka/>`_
+
+    **Options for model (esupar engine)**
+        * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
+            `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_
+        * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \
+            pre-trained on Thai Wikipedia texts for POS-tagging and \
+            dependency-parsing `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_
+        * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \
+            pre-trained on Thai Wikipedia texts for POS-tagging and \
+            dependency-parsing. (syllable level) `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_
+        * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \
+            pre-trained on Thai Wikipedia texts for POS-tagging \
+            and dependency-parsing. (char level) `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_
+
+    If you want to train model for esupar, you can read \
+    `GitHub <https://github.com/KoichiYasuoka/esupar>`_
+
+    **Options for model (transformers_ud engine)**
+        * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \
+            DeBERTa(V2) model pretrained on Thai Wikipedia texts \
+            for dependency-parsing (head-detection on Universal \
+            Dependencies) as question-answering, derived from \
+            deberta-base-thai. \
+            trained by th_blackboard.conll. `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_
+        * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \
+            RoBERTa model pretrained on Thai Wikipedia texts \
+            for dependency-parsing. `Huggingface \
+            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_
+
+    :Example:
+    ::
+
+        from pythainlp.parse import dependency_parsing
+
+        print(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
+        # output:
+        # 1       ผม      _       PRON    _       _       3       nsubj   _       SpaceAfter=No
+        # 2       เป็น     _       VERB    _       _       3       cop     _       SpaceAfter=No
+        # 3       คน      _       NOUN    _       _       0       root    _       SpaceAfter=No
+        # 4       ดี       _       VERB    _       _       3       acl     _       SpaceAfter=No
+
+        print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
+        # output:
+        # 1       ผม              PRON    PPRS    _       2       nsubj   _       SpaceAfter=No
+        # 2       เป็น             VERB    VSTA    _       0       ROOT    _       SpaceAfter=No
+        # 3       คนดี             NOUN    NCMN    _       2       obj     _       SpaceAfter=No
+    """
+    global _tagger, _tagger_name, _tagger_model
+    # spacy_thai takes no model argument, so a differing model must not
+    # force that engine to be rebuilt.
+    model_key = None if engine == "spacy_thai" else model
+    # Rebuild when nothing is cached yet or when the engine or the model
+    # differs from what the cached parser was built for.
+    if (
+        _tagger is None
+        or _tagger_name != engine
+        or _tagger_model != model_key
+    ):
+        # Engines are imported lazily so only the requested engine's
+        # optional dependencies need to be installed.
+        if engine == "esupar":
+            from pythainlp.parse.esupar_engine import Parse
+            _tagger = Parse(model=model)
+        elif engine == "transformers_ud":
+            from pythainlp.parse.transformers_ud import Parse
+            _tagger = Parse(model=model)
+        elif engine == "spacy_thai":
+            from pythainlp.parse.spacy_thai_engine import Parse
+            _tagger = Parse()
+        else:
+            raise NotImplementedError(
+                "The engine doesn't support."
+            )
+        # Record the cache key only after the parser was built.
+        _tagger_name = engine
+        _tagger_model = model_key
+    return _tagger(text)
diff --git a/pythainlp/parse/esupar_engine.py b/pythainlp/parse/esupar_engine.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+esupar: Tokenizer POS-tagger and Dependency-parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages
+
+GitHub: https://github.com/KoichiYasuoka/esupar
+"""
+import esupar
+
+
+class Parse:
+    """
+    Callable wrapper around a pipeline loaded with ``esupar.load``.
+    """
+
+    def __init__(self, model: str="th") -> None:
+        """
+        Load the esupar pipeline.
+
+        :param str model: model name passed to ``esupar.load``;
+            ``None`` is treated as ``"th"``
+        """
+        # Identity test is the idiomatic (PEP 8) way to detect None.
+        if model is None:
+            model = "th"
+        self.nlp = esupar.load(model)
+
+    def __call__(self, text):
+        """
+        Run the loaded pipeline on text.
+
+        :param text: text to parse
+        :return: whatever the esupar pipeline returns for text, unchanged
+        """
+        # NOTE(review): dependency_parsing() in core.py documents a str
+        # result; confirm whether this value needs a str() conversion.
+        return self.nlp(text)
diff --git a/pythainlp/parse/spacy_thai_engine.py b/pythainlp/parse/spacy_thai_engine.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+"""
+spacy_thai: Tokenizer, POS-tagger, and dependency-parser for Thai language, working on Universal Dependencies.
+
+GitHub: https://github.com/KoichiYasuoka/spacy-thai
+"""
+import spacy_thai
+
+
+class Parse:
+    """
+    Callable wrapper that renders spacy_thai output as tab-separated rows.
+    """
+
+    def __init__(self, model: str="th") -> None:
+        # ``model`` is accepted but not used: the pipeline is always
+        # obtained from spacy_thai.load() without arguments.
+        self.nlp = spacy_thai.load()
+
+    def __call__(self, text:str)->str:
+        """
+        Parse text and return one tab-separated line per token.
+
+        :param str text: text to parse
+        :return: newline-joined rows of ten fields each
+        """
+        rows = []
+        for token in self.nlp(text):
+            # The token that is its own head is written with head index 0.
+            head = 0 if token.head == token else token.head.i + 1
+            misc = "_" if token.whitespace_ else "SpaceAfter=No"
+            fields = [
+                str(token.i + 1),
+                token.orth_,
+                token.lemma_,
+                token.pos_,
+                token.tag_,
+                "_",
+                str(head),
+                token.dep_,
+                "_",
+                misc,
+            ]
+            rows.append("\t".join(fields))
+        return "\n".join(rows)
diff --git a/pythainlp/parse/transformers_ud.py b/pythainlp/parse/transformers_ud.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+"""
+TransformersUD
+
+Author: Prof. Koichi Yasuoka
+
+This tagger is provided under the terms of the apache-2.0 License.
+
+The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head
+
+GitHub: https://github.com/KoichiYasuoka
+"""
+import os
+import numpy
+import torch
+import ufal.chu_liu_edmonds
+from transformers import (
+    AutoTokenizer,
+    AutoModelForQuestionAnswering,
+    AutoModelForTokenClassification,
+    AutoConfig,
+    TokenClassificationPipeline
+)
+from transformers.utils import cached_file
+
+
+class Parse:
+    """
+    Dependency parser built from a question-answering head model plus
+    ``deprel`` and ``tagger`` token-classification models.
+    """
+
+    def __init__(self, model: str="KoichiYasuoka/deberta-base-thai-ud-head") -> None:
+        """
+        Load the tokenizer, the head model and the two tagging pipelines.
+
+        :param str model: local directory or model name; ``None`` is
+            treated as ``"KoichiYasuoka/deberta-base-thai-ud-head"``
+        """
+        # Identity test is the idiomatic (PEP 8) way to detect None.
+        if model is None:
+            model = "KoichiYasuoka/deberta-base-thai-ud-head"
+        self.tokenizer=AutoTokenizer.from_pretrained(model)
+        self.model=AutoModelForQuestionAnswering.from_pretrained(model)
+        x=AutoModelForTokenClassification.from_pretrained
+        # The deprel and tagger models live in sub-folders of the model.
+        if os.path.isdir(model):
+            d,t=x(os.path.join(model,"deprel")),x(os.path.join(model,"tagger"))
+        else:
+            # Not a local directory: each config and weight file is looked
+            # up individually through cached_file().
+            c=AutoConfig.from_pretrained(cached_file(model,"deprel/config.json"))
+            d=x(cached_file(model,"deprel/pytorch_model.bin"),config=c)
+            s=AutoConfig.from_pretrained(cached_file(model,"tagger/config.json"))
+            t=x(cached_file(model,"tagger/pytorch_model.bin"),config=s)
+        self.deprel=TokenClassificationPipeline(
+            model=d,
+            tokenizer=self.tokenizer,
+            aggregation_strategy="simple"
+        )
+        self.tagger=TokenClassificationPipeline(
+            model=t,
+            tokenizer=self.tokenizer
+        )
+
+    def __call__(self, text: str)->str:
+        """
+        Parse text and return one tab-separated line per word.
+
+        :param str text: text to parse
+        :return: ten-field rows, each ending in a newline, followed by
+            one extra blank line
+        """
+        # w: (start, end, label) for every span found by the deprel pipeline.
+        w=[(t["start"],t["end"],t["entity_group"]) for t in self.deprel(text)]
+        # z: start offset -> tagger label split on "|"; n: number of spans.
+        z,n={t["start"]:t["entity"].split("|") for t in self.tagger(text)},len(w)
+        # r: surface text of each span; m: (n+1)x(n+1) score matrix of NaN.
+        r,m=[text[s:e] for s,e,p in w],numpy.full((n+1,n+1),numpy.nan)
+        # v: token ids of each span without special tokens; c: model inputs.
+        v,c=self.tokenizer(r,add_special_tokens=False)["input_ids"],[]
+        # One input per span i: [CLS span SEP], then every span in order with
+        # span i replaced by a single mask token, then a closing SEP.
+        for i,t in enumerate(v):
+            q=[self.tokenizer.cls_token_id]+t+[self.tokenizer.sep_token_id]
+            c.append([q]+v[0:i]+[[self.tokenizer.mask_token_id]]+v[i+1:]+[[q[-1]]])
+        # b: cumulative token count after each segment of every input.
+        b=[[len(sum(x[0:j+1],[])) for j in range(len(x))] for x in c]
+        with torch.no_grad():
+            d=self.model(
+                input_ids=torch.tensor([sum(x,[]) for x in c]),
+                token_type_ids=torch.tensor([[0]*x[0]+[1]*(x[-1]-x[0]) for x in b])
+            )
+        s,e=d.start_logits.tolist(),d.end_logits.tolist()
+        # m[i+1, j+1] adds the start logit at span j's first token to the end
+        # logit at its last token, read from the input built for span i; the
+        # masked position itself (i == j) is stored in column 0 instead.
+        for i in range(n):
+            for j in range(n):
+                m[i+1,0 if i==j else j+1]=s[i][b[i][j]]+e[i][b[i][j+1]-1]
+        # h[i] ends up in the head column of the output row for span i.
+        h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+        # When h does not contain exactly one 0, keep a single candidate in
+        # column 0 (the first span labelled "root", else the best-scoring
+        # row) and decode again.
+        if [0 for i in h if i==0]!=[0]:
+            i=([p for s,e,p in w]+["root"]).index("root")
+            j=i+1 if i<n else numpy.nanargmax(m[:,0])
+            m[0:j,0]=m[j+1:,0]=numpy.nan
+            h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+        u=""
+        for i,(s,e,p) in enumerate(w,1):
+            # A span headed by 0 is labelled "root"; any other span the
+            # deprel model called "root" is relabelled "dep".
+            p="root" if h[i]==0 else "dep" if p=="root" else p
+            # z[s][0][2:] drops the first two characters of the tag
+            # (presumably a "B-"/"I-" prefix -- TODO confirm).
+            u+="\t".join(
+                [str(i),r[i-1],"_",z[s][0][2:],"_","|".join(z[s][1:]),str(h[i]),p,"_","_" if i<n and e<w[i][0] else "SpaceAfter=No"]
+            )+"\n"
+        return u+"\n"
diff --git a/setup.py b/setup.py
@@ -81,6 +81,22 @@
         "onnxruntime>=1.10.0"
     ],
     "thai_nner": ["thai_nner"],
+    "esupar": [
+        "esupar>=1.3.8",
+        "numpy",
+        "transformers>=4.22.1",
+    ],
+    "spacy_thai": ["spacy_thai>=0.7.1"],
+    "transformers_ud": [
+        "ufal.chu-liu-edmonds>=1.0.2",
+        "transformers>=4.22.1",
+    ],
+    "dependency_parsing": [
+        "esupar>=1.3.8",
+        "spacy_thai>=0.7.1",
+        "ufal.chu-liu-edmonds>=1.0.2",
+        "transformers>=4.22.1",
+    ],
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -98,7 +114,7 @@
         "torch>=1.0.0",
         "fastai<2.0",
         "bpemb>=0.3.2",
-        "transformers>=4.6.0",
+        "transformers>=4.22.1",
         "sefr_cut>=1.1",
         "phunspell>=0.1.6",
         "spylls>=0.1.5",
@@ -108,7 +124,10 @@
         "nlpo3>=1.2.2",
         "onnxruntime>=1.10.0",
         "thai_nner",
-        "wunsen>=0.0.3"
+        "wunsen>=0.0.3",
+        "spacy_thai>=0.7.1",
+        "esupar>=1.3.8",
+        "ufal.chu-liu-edmonds>=1.0.2",
     ],
 }
 
diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+from pythainlp.parse import dependency_parsing
+
+
+class TestParsePackage(unittest.TestCase):
+    """Smoke tests for pythainlp.parse."""
+
+    def test_dependency_parsing(self):
+        # Every engine must return something for the same short sentence.
+        sentence = "ผมเป็นคนดี"
+        for engine in ("esupar", "transformers_ud", "spacy_thai"):
+            self.assertIsNotNone(dependency_parsing(sentence, engine=engine))