Add wtpsplit to sentence segmentation & paragraph segmentation #804

Merged
merged 4 commits on Jun 6, 2023
1 change: 1 addition & 0 deletions docker_requirements.txt
@@ -34,4 +34,5 @@ khanaa==0.0.6
spacy_thai==0.7.1
esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
wtpsplit==1.0.1
fastcoref==2.1.6
1 change: 1 addition & 0 deletions docs/api/tokenize.rst
@@ -10,6 +10,7 @@ Modules

.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: paragraph_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: word_tokenize
.. autofunction:: word_detokenize
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -25,6 +25,7 @@
    "subword_tokenize",
    "word_tokenize",
    "word_detokenize",
    "paragraph_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
@@ -46,6 +47,7 @@
    subword_tokenize,
    word_tokenize,
    word_detokenize,
    paragraph_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
68 changes: 68 additions & 0 deletions pythainlp/tokenize/core.py
@@ -344,6 +344,12 @@ def sent_tokenize(
        * *thaisum* - the sentence segmenter implementation from \
            Nakhun Chumpolsathien, 2020
        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
            It supports several model sizes: ``wtp`` uses the mini model (default), \
            ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
            ``wtp-mini`` uses the ``wtp-bert-mini`` model, \
            ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
            and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
        * *whitespace+newline* - split by whitespace and newline.
        * *whitespace* - split by whitespace. Specifically, with the \
            :class:`regex` pattern ``r" +"``
@@ -414,6 +420,13 @@ def sent_tokenize(

        segment = segmentor()
        segments = segment.split_into_sentences(text)
    elif engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment
        segments = segment(text, size=_size, tokenize="sentence")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
@@ -426,6 +439,61 @@
    return segments
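
A minimal usage sketch of the new engine strings (an illustration, not part of this diff): the suffix after ``wtp-`` is forwarded to ``pythainlp.tokenize.wtsplit.tokenize`` as the model size, plain ``wtp`` falls back to the mini model, and the chosen wtpsplit model is downloaded on first use. The Thai sample sentence is arbitrary.

    from pythainlp.tokenize import sent_tokenize

    text = "ฉันรักภาษาไทย เพราะฉันเป็นคนไทย"

    sent_tokenize(text, engine="wtp")        # wtp-bert-mini
    sent_tokenize(text, engine="wtp-tiny")   # wtp-bert-tiny
    sent_tokenize(text, engine="wtp-large")  # wtp-canine-s-12l (larger model)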


def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
    """
    Paragraph tokenizer.

    Tokenizes text into paragraphs.

    :param str text: text to be tokenized
    :param str engine: the name of the paragraph tokenizer
    :return: list of paragraphs, each a list of sentences
    :rtype: List[List[str]]
    **Options for engine**
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
            It supports several model sizes: ``wtp`` uses the mini model, \
            ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
            ``wtp-mini`` uses the ``wtp-bert-mini`` model (default), \
            ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
            and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.

    :Example:

    Split the text based on *wtp*::

        from pythainlp.tokenize import paragraph_tokenize

        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )

        paragraph_tokenize(sent)
        # output: [
        #     ['(1) '],
        #     [
        #         'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ',
        #         'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
        #         'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
        #         'ณ ที่นี้'
        #     ]]
    """
    if engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment
        segments = segment(text, size=_size, tokenize="paragraph")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our document."""
        )
    return segments

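A short sketch (not part of the diff) of consuming the nested return value with the default ``wtp-mini`` engine; each inner list holds the sentences of one paragraph, and the sample text is arbitrary.

    from pythainlp.tokenize import paragraph_tokenize

    paragraphs = paragraph_tokenize("ฉันรักภาษาไทย เพราะฉันเป็นคนไทย")
    for i, paragraph in enumerate(paragraphs):
        # Each paragraph is a list of sentence strings.
        print(i, "".join(paragraph))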

def subword_tokenize(
    text: str,
    engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
57 changes: 57 additions & 0 deletions pythainlp/tokenize/wtsplit.py
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Where's the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation

GitHub: https://github.com/bminixhofer/wtpsplit
"""
from typing import List
from wtpsplit import WtP

# Cache the most recently loaded model so repeated calls do not reload it.
_MODEL = None
_MODEL_NAME = None


def _tokenize(
    text: str,
    lang_code: str = "th",
    model: str = "wtp-bert-mini",
    tokenize: str = "sentence",
) -> List[str]:
    global _MODEL_NAME, _MODEL
    if _MODEL_NAME != model:
        _MODEL = WtP(model_name_or_model=model)
        _MODEL_NAME = model
    if tokenize == "sentence":
        return _MODEL.split(text, lang_code=lang_code)
    else:  # paragraph: a list of paragraphs, each a list of sentences
        return _MODEL.split(
            text,
            lang_code=lang_code,
            do_paragraph_segmentation=True,
        )


def tokenize(text: str, size: str = "mini", tokenize: str = "sentence") -> List[str]:
    # Map the short size name to the corresponding wtpsplit model.
    _model_load = ""
    if size == "tiny":
        _model_load = "wtp-bert-tiny"
    elif size == "base":
        _model_load = "wtp-canine-s-1l"
    elif size == "large":
        _model_load = "wtp-canine-s-12l"
    else:  # mini (default)
        _model_load = "wtp-bert-mini"
    return _tokenize(text, model=_model_load, tokenize=tokenize)
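
The module keeps the most recently loaded ``WtP`` model in the module-level ``_MODEL``/``_MODEL_NAME`` globals, so repeated calls with the same size reuse the loaded weights instead of re-instantiating them. A hedged sketch of calling the helper directly (the public entry points remain ``sent_tokenize`` and ``paragraph_tokenize``); the sample text is arbitrary:

    from pythainlp.tokenize.wtsplit import tokenize

    text = "ฉันรักภาษาไทย เพราะฉันเป็นคนไทย"
    # The first call loads wtp-bert-tiny; the second reuses the cached model.
    sentences = tokenize(text, size="tiny", tokenize="sentence")
    paragraphs = tokenize(text, size="tiny", tokenize="paragraph")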
2 changes: 2 additions & 0 deletions setup.py
@@ -78,6 +78,7 @@
        "sentencepiece>=0.1.91"
    ],
    "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
    "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
    "wordnet": ["nltk>=3.3"],
    "generate": ["fastai<2.0"],
    "sefr_cut": ["sefr_cut>=1.1"],
"sefr_cut": ["sefr_cut>=1.1"],
Expand Down Expand Up @@ -140,6 +141,7 @@
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.3",
"wtpsplit>=1.0.1",
"spacy_thai>=0.7.1",
"spacy>=3.0",
"fastcoref>=2.1.5",
36 changes: 36 additions & 0 deletions tests/test_tokenize.py
@@ -23,6 +23,7 @@
    tltk,
    oskut,
    word_detokenize,
    paragraph_tokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
from pythainlp.util import dict_trie
@@ -306,6 +307,30 @@ def test_sent_tokenize(self):
                engine="thaisum",
            ),
        )
        self.assertIsNotNone(
            sent_tokenize(
                sent_3,
                engine="wtp",
            ),
        )
        self.assertIsNotNone(
            sent_tokenize(
                sent_3,
                engine="wtp-tiny",
            ),
        )
        # self.assertIsNotNone(
        #     sent_tokenize(
        #         sent_3,
        #         engine="wtp-base",
        #     ),
        # )
        # self.assertIsNotNone(
        #     sent_tokenize(
        #         sent_3,
        #         engine="wtp-large",
        #     ),
        # )
        self.assertFalse(
            " "
            in sent_tokenize(
@@ -317,6 +342,17 @@ def test_sent_tokenize(self):
        with self.assertRaises(ValueError):
            sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist

    def test_paragraph_tokenize(self):
        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
            + "จากผลงานวิจัยที่เคยทำมาในอดีต"
            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )
        self.assertIsNotNone(paragraph_tokenize(sent))
        with self.assertRaises(ValueError):
            paragraph_tokenize(sent, engine="ai2+2thai")

    def test_subword_tokenize(self):
        self.assertEqual(subword_tokenize(None), [])
        self.assertEqual(subword_tokenize(""), [])