diff --git a/docker_requirements.txt b/docker_requirements.txt
index 03f8ce3db..72fe9e02e 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -34,4 +34,5 @@ khanaa==0.0.6
spacy_thai==0.7.1
esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
+wtpsplit==1.0.1
fastcoref==2.1.6
diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index ced072da4..dcec5dc07 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -10,6 +10,7 @@ Modules
.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
+.. autofunction:: paragraph_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: word_tokenize
.. autofunction:: word_detokenize
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 39d7a7151..674153cc7 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -25,6 +25,7 @@
"subword_tokenize",
"word_tokenize",
"word_detokenize",
+ "paragraph_tokenize",
]
from pythainlp.corpus import thai_syllables, thai_words
@@ -46,6 +47,7 @@
subword_tokenize,
word_tokenize,
word_detokenize,
+ paragraph_tokenize,
)
from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 2482d08ff..73b98a88a 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -344,6 +344,12 @@ def sent_tokenize(
* *thaisum* - The implementation of sentence segmentator from \
Nakhun Chumpolsathien, 2020
* *tltk* - split by `TLTK `_.,
+ * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
+ Several model sizes are supported: \
+ ``wtp`` and ``wtp-mini`` use the ``wtp-bert-mini`` model (default), \
+ ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
+ ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
+ and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
* *whitespace+newline* - split by whitespaces and newline.
* *whitespace* - split by whitespaces. Specifically, with \
:class:`regex` pattern ``r" +"``
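For context (not part of the diff): a minimal usage sketch of the new engine option, assuming the ``wtp`` extra (``wtpsplit`` plus ``transformers``) is installed and the selected checkpoint can be downloaded on first use::

    from pythainlp.tokenize import sent_tokenize

    text = "ฉันไปโรงเรียน เธอไปตลาด"  # any Thai text

    # Bare "wtp" falls back to the mini model (wtp-bert-mini);
    # a size suffix such as "wtp-tiny" selects the matching checkpoint.
    sent_tokenize(text, engine="wtp")
    sent_tokenize(text, engine="wtp-tiny")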
@@ -414,6 +420,13 @@ def sent_tokenize(
segment = segmentor()
segments = segment.split_into_sentences(text)
+ elif engine.startswith("wtp"):
+ if "-" not in engine:
+ _size="mini"
+ else:
+ _size = engine.split("-")[-1]
+ from pythainlp.tokenize.wtsplit import tokenize as segment
+ segments = segment(text, size=_size, tokenize="sentence")
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
@@ -426,6 +439,61 @@ def sent_tokenize(
return segments
+def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
+ """
+ Paragraph tokenizer.
+
+ Tokenizes text into paragraphs.
+
+ :param str text: text to be tokenized
+ :param str engine: the name of the paragraph tokenizer
+ :return: list of paragraphs, each as a list of sentences
+ :rtype: List[List[str]]
+ **Options for engine**
+ * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
+ Several model sizes are supported: \
+ ``wtp`` and ``wtp-mini`` use the ``wtp-bert-mini`` model (default), \
+ ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
+ ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
+ and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
+
+ :Example:
+
+ Split the text based on *wtp*::
+
+ from pythainlp.tokenize import paragraph_tokenize
+
+ sent = (
+ "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
+ +" มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+ +" จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+ )
+
+ paragraph_tokenize(sent)
+ # output: [
+ # ['(1) '],
+ # [
+ # 'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ',
+ # 'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
+ # 'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
+ # 'ณ ที่นี้'
+ # ]]
+ """
+ if engine.startswith("wtp"):
+ if "-" not in engine:
+ _size="mini"
+ else:
+ _size = engine.split("-")[-1]
+ from pythainlp.tokenize.wtsplit import tokenize as segment
+ segments = segment(text, size=_size, tokenize="paragraph")
+ else:
+ raise ValueError(
+ f"""Tokenizer \"{engine}\" not found.
+ It might be a typo; if not, please consult our document."""
+ )
+ return segments
+
+
def subword_tokenize(
text: str,
engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
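Both new code paths above derive the model size from the engine string in the same way; a standalone sketch of that mapping (``_wtp_size`` is a hypothetical name, mirroring the added logic)::

    def _wtp_size(engine: str) -> str:
        # Bare "wtp" means the mini model; otherwise take the suffix.
        if "-" not in engine:
            return "mini"
        return engine.split("-")[-1]  # e.g. "wtp-large" -> "large"

    assert _wtp_size("wtp") == "mini"
    assert _wtp_size("wtp-tiny") == "tiny"
    assert _wtp_size("wtp-large") == "large"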
diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
new file mode 100644
index 000000000..20c8a8eb1
--- /dev/null
+++ b/pythainlp/tokenize/wtsplit.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Where's the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation
+
+GitHub: https://github.com/bminixhofer/wtpsplit
+"""
+from typing import List, Union
+from wtpsplit import WtP
+
+_MODEL = None
+_MODEL_NAME = None
+
+
+def _tokenize(
+ text: str,
+ lang_code: str = "th",
+ model: str = "wtp-bert-mini",
+ tokenize: str = "sentence",
+ ) -> Union[List[str], List[List[str]]]:
+ global _MODEL_NAME, _MODEL
+ if _MODEL_NAME != model:
+ _MODEL = WtP(model_name_or_model=model)
+ _MODEL_NAME = model
+ if tokenize=="sentence":
+ return _MODEL.split(text,lang_code=lang_code)
+ else: # paragraph: a list of paragraphs, each a list of sentences
+ return _MODEL.split(
+ text,
+ lang_code=lang_code,
+ do_paragraph_segmentation=True
+ )
+
+
+def tokenize(
+ text: str, size: str = "mini", tokenize: str = "sentence"
+ ) -> Union[List[str], List[List[str]]]:
+ if size == "tiny":
+ _model_load = "wtp-bert-tiny"
+ elif size == "base":
+ _model_load = "wtp-canine-s-1l"
+ elif size == "large":
+ _model_load = "wtp-canine-s-12l"
+ else: # mini (default); unknown sizes fall back to the mini model
+ _model_load = "wtp-bert-mini"
+ return _tokenize(text, model=_model_load, tokenize=tokenize)
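Note that the module caches the loaded ``WtP`` model in ``_MODEL``/``_MODEL_NAME``, so repeated calls with the same size reuse one model instance. A small sketch of calling the helper directly (normally the ``sent_tokenize``/``paragraph_tokenize`` wrappers are used instead; model download on first use is assumed)::

    from pythainlp.tokenize.wtsplit import tokenize

    text = "ฉันไปโรงเรียน เธอไปตลาด"

    # size="tiny" selects the wtp-bert-tiny checkpoint.
    sentences = tokenize(text, size="tiny", tokenize="sentence")
    # Paragraph mode returns a list of paragraphs, each a list of sentences.
    paragraphs = tokenize(text, size="tiny", tokenize="paragraph")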
diff --git a/setup.py b/setup.py
index b2ca9e021..10ca6b107 100644
--- a/setup.py
+++ b/setup.py
@@ -78,6 +78,7 @@
"sentencepiece>=0.1.91"
],
"mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
+ "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
"wordnet": ["nltk>=3.3"],
"generate": ["fastai<2.0"],
"sefr_cut": ["sefr_cut>=1.1"],
@@ -140,6 +141,7 @@
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.3",
+ "wtpsplit>=1.0.1",
"spacy_thai>=0.7.1",
"spacy>=3.0",
"fastcoref>=2.1.5",
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 18e4cacbc..4659ff08c 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -23,6 +23,7 @@
tltk,
oskut,
word_detokenize,
+ paragraph_tokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
from pythainlp.util import dict_trie
@@ -306,6 +307,30 @@ def test_sent_tokenize(self):
engine="thaisum",
),
)
+ self.assertIsNotNone(
+ sent_tokenize(
+ sent_3,
+ engine="wtp",
+ ),
+ )
+ self.assertIsNotNone(
+ sent_tokenize(
+ sent_3,
+ engine="wtp-tiny",
+ ),
+ )
+ # self.assertIsNotNone(
+ # sent_tokenize(
+ # sent_3,
+ # engine="wtp-base",
+ # ),
+ # )
+ # self.assertIsNotNone(
+ # sent_tokenize(
+ # sent_3,
+ # engine="wtp-large",
+ # ),
+ # )
self.assertFalse(
" "
in sent_tokenize(
@@ -317,6 +342,17 @@ def test_sent_tokenize(self):
with self.assertRaises(ValueError):
sent_tokenize("ฉันไป กิน", engine="XX") # engine does not exist
+ def test_paragraph_tokenize(self):
+ sent = (
+ "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
+ + "จากผลงานวิจัยที่เคยทำมาในอดีต"
+ + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+ + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+ )
+ self.assertIsNotNone(paragraph_tokenize(sent))
+ with self.assertRaises(ValueError):
+ paragraph_tokenize(sent, engine="ai2+2thai")
+
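A possible extra assertion (hypothetical, not part of this diff) that would also pin down the documented return shape, appended inside ``test_paragraph_tokenize``::

    paragraphs = paragraph_tokenize(sent)
    self.assertIsInstance(paragraphs, list)
    self.assertIsInstance(paragraphs[0], list)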
def test_subword_tokenize(self):
self.assertEqual(subword_tokenize(None), [])
self.assertEqual(subword_tokenize(""), [])