Skip to content

Commit e2a3404

Browse files
authored
Merge pull request #706 from PyThaiNLP/add-dependency-parser
Add pythainlp.parse.dependency_parsing
2 parents 638c28d + e9b5ffb commit e2a3404

File tree

10 files changed

+268
-4
lines changed

10 files changed

+268
-4
lines changed

docker_requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ sentencepiece==0.1.91
1212
ssg==0.0.8
1313
torch==1.8.1
1414
fastai==1.0.61
15-
transformers==4.8.2
15+
transformers==4.22.1
1616
phunspell==0.1.6
1717
spylls==0.1.5
1818
symspellpy==6.7.6
@@ -31,3 +31,6 @@ thai-nner==0.3
3131
spacy==2.3.*
3232
wunsen==0.0.3
3333
khanaa==0.0.6
34+
spacy_thai==0.7.1
35+
esupar==1.3.8
36+
ufal.chu-liu-edmonds==1.0.2

docs/api/parse.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
.. currentmodule:: pythainlp.parse
2+
3+
pythainlp.parse
4+
===============
5+
The :mod:`pythainlp.parse` module provides dependency parsing for Thai.
6+
7+
Modules
8+
-------
9+
10+
.. autofunction:: dependency_parsing

docs/notes/installation.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ where ``extras`` can be
3131
- ``tltk`` (to support tltk)
3232
- ``textaugment`` (to support text augmentation)
3333
- ``oskut`` (to support OSKUT)
34-
- ``nlpo3`` (to support nlpo3 enging)
34+
- ``nlpo3`` (to support nlpo3 engine)
35+
- ``spacy_thai`` (to support spacy_thai engine)
36+
- ``esupar`` (to support esupar engine)
37+
- ``transformers_ud`` (to support transformers_ud engine)
38+
- ``dependency_parsing`` (to support dependency parsing with all engines)
3539
- ``full`` (install everything)
3640

3741
For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.

pythainlp/parse/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
PyThaiNLP Parse
4+
"""
5+
__all__ = [
6+
"dependency_parsing"
7+
]
8+
from pythainlp.parse.core import dependency_parsing

pythainlp/parse/core.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# -*- coding: utf-8 -*-
2+
# Module-level cache of the most recently built parser, together with the
# engine name and model it was built for.
_tagger = None
_tagger_name = ""
_tagger_model = None


def dependency_parsing(
    text: str, model: str = None, engine: str = "esupar"
) -> str:
    """
    Dependency Parsing

    The parser object is cached at module level and reused across calls.
    It is rebuilt whenever ``engine`` or ``model`` differs from the values
    used to build the cached parser.

    :param str text: text to do dependency parsing
    :param str model: model for using with engine \
        (for esupar and transformers_ud); ``None`` lets the engine \
        pick its default model
    :param str engine: the name of the dependency parser
    :return: str (conllu)
    :raises NotImplementedError: if ``engine`` is not one of the \
        supported engines

    **Options for engine**
        * *esupar* (default) - Tokenizer POS-tagger and Dependency-parser \
            with BERT/RoBERTa/DeBERTa model. `GitHub \
            <https://github.com/KoichiYasuoka/esupar>`_
        * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
            for Thai language, working on Universal Dependencies. \
            `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_
        * *transformers_ud* - TransformersUD \
            `GitHub <https://github.com/KoichiYasuoka/>`_

    **Options for model (esupar engine)**
        * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
            `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_
        * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_
        * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing. (syllable level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_
        * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging \
            and dependency-parsing. (char level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_

    If you want to train a model for esupar, you can read \
    `GitHub <https://github.com/KoichiYasuoka/esupar>`_

    **Options for model (transformers_ud engine)**
        * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \
            DeBERTa(V2) model pretrained on Thai Wikipedia texts \
            for dependency-parsing (head-detection on Universal \
            Dependencies) as question-answering, derived from \
            deberta-base-thai. \
            trained by th_blackboard.conll. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_
        * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \
            roberta model pretrained on Thai Wikipedia texts \
            for dependency-parsing. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_

    :Example:
    ::

        from pythainlp.parse import dependency_parsing

        print(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
        # output:
        # 1 ผม _ PRON _ _ 3 nsubj _ SpaceAfter=No
        # 2 เป็น _ VERB _ _ 3 cop _ SpaceAfter=No
        # 3 คน _ NOUN _ _ 0 root _ SpaceAfter=No
        # 4 ดี _ VERB _ _ 3 acl _ SpaceAfter=No

        print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
        # output:
        # 1 ผม PRON PPRS _ 2 nsubj _ SpaceAfter=No
        # 2 เป็น VERB VSTA _ 0 ROOT _ SpaceAfter=No
        # 3 คนดี NOUN NCMN _ 2 obj _ SpaceAfter=No
    """
    global _tagger, _tagger_name, _tagger_model
    # Rebuild when nothing is cached yet, or when the engine OR the model
    # changed. Keying the cache on the engine alone would silently keep
    # serving the previously loaded model after the caller asks for another.
    if (
        _tagger is None
        or _tagger_name != engine
        or _tagger_model != model
    ):
        # Engine modules are imported lazily so that only the optional
        # dependencies of the selected engine need to be installed.
        if engine == "esupar":
            from pythainlp.parse.esupar_engine import Parse
            _tagger = Parse(model=model)
        elif engine == "transformers_ud":
            from pythainlp.parse.transformers_ud import Parse
            _tagger = Parse(model=model)
        elif engine == "spacy_thai":
            from pythainlp.parse.spacy_thai_engine import Parse
            _tagger = Parse()
        else:
            raise NotImplementedError(
                "The engine doesn't support."
            )
        # Only record the cache key once the parser was built successfully.
        _tagger_name = engine
        _tagger_model = model
    return _tagger(text)

pythainlp/parse/esupar_engine.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
esupar: Tokenizer POS-tagger and Dependency-parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages
4+
5+
GitHub: https://github.com/KoichiYasuoka/esupar
6+
"""
7+
import esupar
8+
9+
10+
class Parse:
    """Callable wrapper around a pipeline loaded with ``esupar.load``."""

    def __init__(self, model: str = "th") -> None:
        """
        Load the esupar pipeline.

        :param str model: model name passed to ``esupar.load``;
            ``None`` falls back to ``"th"``
        """
        # Identity check: ``None`` is a singleton, and ``==`` could be
        # overridden by an arbitrary object passed as ``model``.
        if model is None:
            model = "th"
        self.nlp = esupar.load(model)

    def __call__(self, text):
        """
        Run the loaded pipeline on ``text``.

        :param str text: text to parse
        :return: whatever the loaded pipeline returns for ``text``,
            passed through unchanged
        """
        return self.nlp(text)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
spacy_thai: Tokenizer, POS-tagger, and dependency-parser for Thai language, working on Universal Dependencies.
4+
5+
GitHub: https://github.com/KoichiYasuoka/spacy-thai
6+
"""
7+
import spacy_thai
8+
9+
10+
class Parse:
    """Callable wrapper that renders spacy_thai output as CoNLL-U-style rows."""

    def __init__(self, model: str = "th") -> None:
        """
        Load the spacy_thai pipeline.

        :param str model: accepted for signature parity with the other
            engines; it is not used, ``spacy_thai.load()`` is called
            without arguments
        """
        self.nlp = spacy_thai.load()

    def __call__(self, text: str) -> str:
        """
        Parse ``text`` and return one tab-separated row per token.

        Each row has ten columns: 1-based token index, surface form,
        lemma, ``pos_``, ``tag_``, ``_``, head index (``0`` when the token
        is its own head), ``dep_``, ``_`` and finally ``_`` when the token
        has trailing whitespace or ``SpaceAfter=No`` otherwise.

        :param str text: text to parse
        :return: rows joined with newlines (no trailing newline)
        """
        rows = []
        for token in self.nlp(text):
            # A token that is its own head is the root and gets head id 0.
            if token.head == token:
                head_id = "0"
            else:
                head_id = str(token.head.i + 1)
            misc = "_" if token.whitespace_ else "SpaceAfter=No"
            columns = (
                str(token.i + 1),
                token.orth_,
                token.lemma_,
                token.pos_,
                token.tag_,
                "_",
                head_id,
                token.dep_,
                "_",
                misc,
            )
            rows.append("\t".join(columns))
        return "\n".join(rows)

pythainlp/parse/transformers_ud.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
TransformersUD
4+
5+
Author: Prof. Koichi Yasuoka
6+
7+
This tagger is provided under the terms of the apache-2.0 License.
8+
9+
The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head
10+
11+
GitHub: https://github.com/KoichiYasuoka
12+
"""
13+
import os
14+
import numpy
15+
import torch
16+
import ufal.chu_liu_edmonds
17+
from transformers import (
18+
AutoTokenizer,
19+
AutoModelForQuestionAnswering,
20+
AutoModelForTokenClassification,
21+
AutoConfig,
22+
TokenClassificationPipeline
23+
)
24+
from transformers.utils import cached_file
25+
26+
27+
class Parse:
    """
    Dependency parser that scores head candidates with a
    question-answering model and decodes a tree with Chu-Liu/Edmonds.

    Three models are loaded from the same location: the question-answering
    model itself, plus ``deprel`` and ``tagger`` token-classification
    models stored in sub-folders of that location.
    """

    def __init__(
        self, model: str = "KoichiYasuoka/deberta-base-thai-ud-head"
    ) -> None:
        """
        Load the tokenizer, the head-detection model and both pipelines.

        :param str model: local directory or model identifier; ``None``
            falls back to ``KoichiYasuoka/deberta-base-thai-ud-head``
        """
        # Identity check: ``None`` is a singleton.
        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-head"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
        load_classifier = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            # Local checkout: the sub-models live in plain sub-directories.
            deprel_model = load_classifier(os.path.join(model, "deprel"))
            tagger_model = load_classifier(os.path.join(model, "tagger"))
        else:
            # Otherwise resolve each sub-model's config and weights
            # individually through ``cached_file``.
            deprel_config = AutoConfig.from_pretrained(
                cached_file(model, "deprel/config.json")
            )
            deprel_model = load_classifier(
                cached_file(model, "deprel/pytorch_model.bin"),
                config=deprel_config,
            )
            tagger_config = AutoConfig.from_pretrained(
                cached_file(model, "tagger/config.json")
            )
            tagger_model = load_classifier(
                cached_file(model, "tagger/pytorch_model.bin"),
                config=tagger_config,
            )
        self.deprel = TokenClassificationPipeline(
            model=deprel_model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )
        self.tagger = TokenClassificationPipeline(
            model=tagger_model,
            tokenizer=self.tokenizer
        )

    def __call__(self, text: str) -> str:
        """
        Parse ``text`` and return tab-separated rows, one per word.

        :param str text: text to parse
        :return: ten-column rows, each terminated by a newline, followed
            by one extra blank line
        """
        # Word spans and their relation labels come from the deprel
        # pipeline; the tagger output is indexed by start offset.
        spans = [
            (t["start"], t["end"], t["entity_group"])
            for t in self.deprel(text)
        ]
        tags = {
            t["start"]: t["entity"].split("|") for t in self.tagger(text)
        }
        n = len(spans)
        words = [text[s:e] for s, e, p in spans]
        # scores[i + 1, k] is the score of word i taking head k
        # (k == 0 means root); entries never written stay NaN.
        scores = numpy.full((n + 1, n + 1), numpy.nan)
        ids = self.tokenizer(words, add_special_tokens=False)["input_ids"]
        segments = []
        for i, word_ids in enumerate(ids):
            # One row per word: [CLS word_i SEP] followed by the whole
            # sentence with word_i replaced by the mask token, then SEP.
            query = (
                [self.tokenizer.cls_token_id]
                + word_ids
                + [self.tokenizer.sep_token_id]
            )
            segments.append(
                [query]
                + ids[0:i]
                + [[self.tokenizer.mask_token_id]]
                + ids[i + 1:]
                + [[query[-1]]]
            )
        # bounds[i][j] is the number of tokens in segments 0..j of row i,
        # i.e. the offset at which segment j + 1 starts.
        bounds = [
            [len(sum(row[0:j + 1], [])) for j in range(len(row))]
            for row in segments
        ]
        with torch.no_grad():
            output = self.model(
                input_ids=torch.tensor([sum(row, []) for row in segments]),
                token_type_ids=torch.tensor(
                    [[0] * x[0] + [1] * (x[-1] - x[0]) for x in bounds]
                )
            )
        start_logits = output.start_logits.tolist()
        end_logits = output.end_logits.tolist()
        for i in range(n):
            for j in range(n):
                # Start logit at the first token of candidate j plus end
                # logit at its last token. The masked slot (j == i) is
                # stored in column 0, the root column.
                scores[i + 1, 0 if i == j else j + 1] = (
                    start_logits[i][bounds[i][j]]
                    + end_logits[i][bounds[i][j + 1] - 1]
                )
        heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]
        if sum(1 for head in heads if head == 0) != 1:
            # The decoded tree does not have exactly one root. Keep a
            # single root candidate (the first word labelled "root", or
            # else the best-scoring root entry), blank out the rest of the
            # root column and decode again.
            i = ([p for s, e, p in spans] + ["root"]).index("root")
            j = i + 1 if i < n else numpy.nanargmax(scores[:, 0])
            scores[0:j, 0] = scores[j + 1:, 0] = numpy.nan
            heads = ufal.chu_liu_edmonds.chu_liu_edmonds(scores)[0]
        rows = []
        for i, (s, e, p) in enumerate(spans, 1):
            # Make the label agree with the decoded head: the root word is
            # labelled "root", and a non-root word labelled "root" is
            # downgraded to "dep".
            p = "root" if heads[i] == 0 else "dep" if p == "root" else p
            rows.append(
                "\t".join(
                    [
                        str(i),
                        words[i - 1],
                        "_",
                        tags[s][0][2:],
                        "_",
                        "|".join(tags[s][1:]),
                        str(heads[i]),
                        p,
                        "_",
                        # spans[i] is the NEXT word (i is 1-based here): a
                        # gap before it means whitespace follows.
                        "_" if i < n and e < spans[i][0]
                        else "SpaceAfter=No",
                    ]
                )
                + "\n"
            )
        return "".join(rows) + "\n"

setup.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,22 @@
8181
"onnxruntime>=1.10.0"
8282
],
8383
"thai_nner": ["thai_nner"],
84+
"esupar": [
85+
"esupar>=1.3.8",
86+
"numpy",
87+
"transformers>=4.22.1",
88+
],
89+
"spacy_thai": ["spacy_thai>=0.7.1"],
90+
"transformers_ud": [
91+
"ufal.chu-liu-edmonds>=1.0.2",
92+
"transformers>=4.22.1",
93+
],
94+
"dependency_parsing": [
95+
"esupar>=1.3.8",
96+
"spacy_thai>=0.7.1",
97+
"ufal.chu-liu-edmonds>=1.0.2",
98+
"transformers>=4.22.1",
99+
],
84100
"full": [
85101
"PyYAML>=5.3.1",
86102
"attacut>=1.0.4",
@@ -98,7 +114,7 @@
98114
"torch>=1.0.0",
99115
"fastai<2.0",
100116
"bpemb>=0.3.2",
101-
"transformers>=4.6.0",
117+
"transformers>=4.22.1",
102118
"sefr_cut>=1.1",
103119
"phunspell>=0.1.6",
104120
"spylls>=0.1.5",
@@ -108,7 +124,10 @@
108124
"nlpo3>=1.2.2",
109125
"onnxruntime>=1.10.0",
110126
"thai_nner",
111-
"wunsen>=0.0.3"
127+
"wunsen>=0.0.3",
128+
"spacy_thai>=0.7.1",
129+
"esupar>=1.3.8",
130+
"ufal.chu-liu-edmonds>=1.0.2",
112131
],
113132
}
114133

tests/test_parse.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import unittest
4+
from pythainlp.parse import dependency_parsing
5+
6+
7+
class TestParsePackage(unittest.TestCase):
    """Smoke tests for ``pythainlp.parse``."""

    def test_dependency_parsing(self):
        """Every engine returns a non-``None`` result for a short sentence."""
        sentence = "ผมเป็นคนดี"
        # Engines are exercised in the same order as before; the first
        # failing assertion still aborts the test.
        for engine in ("esupar", "transformers_ud", "spacy_thai"):
            self.assertIsNotNone(dependency_parsing(sentence, engine=engine))

0 commit comments

Comments
 (0)