Add wtpsplit to sentence segmentation & paragraph segmentation #804

Merged
merged 4 commits on Jun 6, 2023
1 change: 1 addition & 0 deletions docker_requirements.txt
@@ -34,4 +34,5 @@ khanaa==0.0.6
spacy_thai==0.7.1
esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
wtpsplit==1.0.1
fastcoref==2.1.6
1 change: 1 addition & 0 deletions docs/api/tokenize.rst
@@ -10,6 +10,7 @@ Modules

.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: paragraph_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: word_tokenize
.. autofunction:: word_detokenize
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -25,6 +25,7 @@
    "subword_tokenize",
    "word_tokenize",
    "word_detokenize",
    "paragraph_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
@@ -46,6 +47,7 @@
    subword_tokenize,
    word_tokenize,
    word_detokenize,
    paragraph_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
68 changes: 68 additions & 0 deletions pythainlp/tokenize/core.py
@@ -344,6 +344,12 @@ def sent_tokenize(
        * *thaisum* - the sentence segmenter implementation from \
            Nakhun Chumpolsathien, 2020
        * *tltk* - split by `TLTK <https://pypi.org/project/tltk/>`_.
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
            It supports several model sizes: ``wtp`` uses the mini model (default), \
            ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
            ``wtp-mini`` uses the ``wtp-bert-mini`` model, \
            ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
            and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
        * *whitespace+newline* - split by whitespace and newline.
        * *whitespace* - split by whitespace. Specifically, with the \
            :class:`regex` pattern ``r" +"``
@@ -414,6 +420,13 @@ def sent_tokenize(

        segment = segmentor()
        segments = segment.split_into_sentences(text)
    elif engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment
        segments = segment(text, size=_size, tokenize="sentence")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
@@ -426,6 +439,61 @@
    return segments
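
A minimal usage sketch of the new engine strings (an illustration, not part of this diff): the suffix after ``wtp-`` is forwarded to ``pythainlp.tokenize.wtsplit.tokenize`` as the model size, plain ``wtp`` falls back to the mini model, and the chosen wtpsplit model is downloaded on first use. The Thai sample sentence is arbitrary.

    from pythainlp.tokenize import sent_tokenize

    text = "ฉันรักภาษาไทย เพราะฉันเป็นคนไทย"

    sent_tokenize(text, engine="wtp")        # wtp-bert-mini
    sent_tokenize(text, engine="wtp-tiny")   # wtp-bert-tiny
    sent_tokenize(text, engine="wtp-large")  # wtp-canine-s-12l (larger model)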


def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
    """
    Paragraph tokenizer.

    Tokenizes text into paragraphs.

    :param str text: text to be tokenized
    :param str engine: the name of the paragraph tokenizer
    :return: list of paragraphs, each a list of sentences
    :rtype: List[List[str]]
    **Options for engine**
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
            It supports several model sizes: ``wtp`` uses the mini model, \
            ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
            ``wtp-mini`` uses the ``wtp-bert-mini`` model (default), \
            ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
            and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.

    :Example:

    Split the text based on *wtp*::

        from pythainlp.tokenize import paragraph_tokenize

        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )

        paragraph_tokenize(sent)
        # output: [
        #     ['(1) '],
        #     [
        #         'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ',
        #         'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
        #         'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
        #         'ณ ที่นี้'
        #     ]]
    """
    if engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment
        segments = segment(text, size=_size, tokenize="paragraph")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our document."""
        )
    return segments

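A short sketch (not part of the diff) of consuming the nested return value with the default ``wtp-mini`` engine; each inner list holds the sentences of one paragraph, and the sample text is arbitrary.

    from pythainlp.tokenize import paragraph_tokenize

    paragraphs = paragraph_tokenize("ฉันรักภาษาไทย เพราะฉันเป็นคนไทย")
    for i, paragraph in enumerate(paragraphs):
        # Each paragraph is a list of sentence strings.
        print(i, "".join(paragraph))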

def subword_tokenize(
    text: str,
    engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
57 changes: 57 additions & 0 deletions pythainlp/tokenize/wtsplit.py
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Where's the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation

GitHub: https://github.com/bminixhofer/wtpsplit
"""
from typing import List
from wtpsplit import WtP

# Cache the most recently loaded model so repeated calls do not reload it.
_MODEL = None
_MODEL_NAME = None


def _tokenize(
    text: str,
    lang_code: str = "th",
    model: str = "wtp-bert-mini",
    tokenize: str = "sentence",
) -> List[str]:
    global _MODEL_NAME, _MODEL
    if _MODEL_NAME != model:
        _MODEL = WtP(model_name_or_model=model)
        _MODEL_NAME = model
    if tokenize == "sentence":
        return _MODEL.split(text, lang_code=lang_code)
    else:  # paragraph: a list of paragraphs, each a list of sentences
        return _MODEL.split(
            text,
            lang_code=lang_code,
            do_paragraph_segmentation=True,
        )


def tokenize(text: str, size: str = "mini", tokenize: str = "sentence") -> List[str]:
    # Map the short size name to the corresponding wtpsplit model.
    _model_load = ""
    if size == "tiny":
        _model_load = "wtp-bert-tiny"
    elif size == "base":
        _model_load = "wtp-canine-s-1l"
    elif size == "large":
        _model_load = "wtp-canine-s-12l"
    else:  # mini (default)
        _model_load = "wtp-bert-mini"
    return _tokenize(text, model=_model_load, tokenize=tokenize)
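
The module keeps the most recently loaded ``WtP`` model in the module-level ``_MODEL``/``_MODEL_NAME`` globals, so repeated calls with the same size reuse the loaded weights instead of re-instantiating them. A hedged sketch of calling the helper directly (the public entry points remain ``sent_tokenize`` and ``paragraph_tokenize``); the sample text is arbitrary:

    from pythainlp.tokenize.wtsplit import tokenize

    text = "ฉันรักภาษาไทย เพราะฉันเป็นคนไทย"
    # The first call loads wtp-bert-tiny; the second reuses the cached model.
    sentences = tokenize(text, size="tiny", tokenize="sentence")
    paragraphs = tokenize(text, size="tiny", tokenize="paragraph")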
2 changes: 2 additions & 0 deletions setup.py
@@ -78,6 +78,7 @@
        "sentencepiece>=0.1.91"
    ],
    "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
    "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
    "wordnet": ["nltk>=3.3"],
    "generate": ["fastai<2.0"],
    "sefr_cut": ["sefr_cut>=1.1"],
"sefr_cut": ["sefr_cut>=1.1"],
Expand Down Expand Up @@ -140,6 +141,7 @@
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.3",
"wtpsplit>=1.0.1",
"spacy_thai>=0.7.1",
"spacy>=3.0",
"fastcoref>=2.1.5",
36 changes: 36 additions & 0 deletions tests/test_tokenize.py
@@ -23,6 +23,7 @@
    tltk,
    oskut,
    word_detokenize,
    paragraph_tokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
from pythainlp.util import dict_trie
@@ -306,6 +307,30 @@ def test_sent_tokenize(self):
                engine="thaisum",
            ),
        )
        self.assertIsNotNone(
            sent_tokenize(
                sent_3,
                engine="wtp",
            ),
        )
        self.assertIsNotNone(
            sent_tokenize(
                sent_3,
                engine="wtp-tiny",
            ),
        )
        # self.assertIsNotNone(
        #     sent_tokenize(
        #         sent_3,
        #         engine="wtp-base",
        #     ),
        # )
        # self.assertIsNotNone(
        #     sent_tokenize(
        #         sent_3,
        #         engine="wtp-large",
        #     ),
        # )
        self.assertFalse(
            " "
            in sent_tokenize(
@@ -317,6 +342,17 @@ def test_sent_tokenize(self):
        with self.assertRaises(ValueError):
            sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist

    def test_paragraph_tokenize(self):
        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
            + "จากผลงานวิจัยที่เคยทำมาในอดีต"
            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )
        self.assertIsNotNone(paragraph_tokenize(sent))
        with self.assertRaises(ValueError):
            paragraph_tokenize(sent, engine="ai2+2thai")

    def test_subword_tokenize(self):
        self.assertEqual(subword_tokenize(None), [])
        self.assertEqual(subword_tokenize(""), [])