Skip to content

Commit

Permalink
Merge pull request #1024 from PyThaiNLP/remove-clause_tokenize
Browse files Browse the repository at this point in the history
Remove clause_tokenize
  • Loading branch information
bact authored Dec 12, 2024
2 parents 216d443 + 3e8501f commit 2287b30
Show file tree
Hide file tree
Showing 5 changed files with 0 additions and 127 deletions.
4 changes: 0 additions & 4 deletions docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions a
Modules
-------

.. autofunction:: clause_tokenize
:noindex:

Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks.

.. autofunction:: sent_tokenize
:noindex:
Expand Down
2 changes: 0 additions & 2 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"THAI2FIT_TOKENIZER",
"Tokenizer",
"Trie",
"clause_tokenize",
"paragraph_tokenize",
"sent_tokenize",
"subword_tokenize",
Expand All @@ -32,7 +31,6 @@

from pythainlp.tokenize.core import (
Tokenizer,
clause_tokenize,
paragraph_tokenize,
sent_tokenize,
subword_tokenize,
Expand Down
37 changes: 0 additions & 37 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,43 +25,6 @@
from pythainlp.util.trie import Trie, dict_trie


def clause_tokenize(doc: List[str]) -> List[List[str]]:
    """
    Group an already word-tokenized text into clauses (clause segmentation).

    The segmentation itself is delegated to ``segment`` from
    :mod:`pythainlp.tokenize.crfcls`, described in the original
    documentation as a CRF trained on Blackboard Treebank.

    :param list[str] doc: running list of words to be split into clauses
    :return: list of clauses, each clause being a list of words
    :rtype: list[list[str]]

    :Example:
    ::

        from pythainlp.tokenize import clause_tokenize

        clause_tokenize(
            [
                "ฉัน",
                "นอน",
                "และ",
                "คุณ",
                "เล่น",
                "มือถือ",
                "ส่วน",
                "น้อง",
                "เขียน",
                "โปรแกรม",
            ]
        )
        # [['ฉัน', 'นอน'],
        # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
        # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
    """
    # Function-scope import: the crfcls module is only imported when this
    # function is actually called.
    from pythainlp.tokenize.crfcls import segment as crf_segment

    clauses = crf_segment(doc)
    return clauses


def word_detokenize(
segments: Union[List[List[str]], List[str]], output: str = "str"
) -> Union[List[str], str]:
Expand Down
77 changes: 0 additions & 77 deletions pythainlp/tokenize/crfcls.py

This file was deleted.

7 changes: 0 additions & 7 deletions tests/extra/testx_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
tltk,
word_tokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize

from ..core.test_tokenize import (
SENT_1,
Expand All @@ -31,12 +30,6 @@
)


class ClauseTokenizeTestCase(unittest.TestCase):
    """Smoke tests for ``clause_tokenize`` (imported as ``sent_clause_tokenize``)."""

    def test_clause_tokenize(self):
        """A short word list yields a non-None result that is a list."""
        # The tokenizer is invoked once per assertion, each time with a
        # fresh input list.
        first_result = sent_clause_tokenize(["ฉัน", "ทดสอบ"])
        self.assertIsNotNone(first_result)

        second_result = sent_clause_tokenize(["ฉัน", "ทดสอบ"])
        self.assertIsInstance(second_result, list)


class DetokenizeTestCase(unittest.TestCase):
def test_numeric_data_format(self):
engines = ["attacut", "deepcut", "sefr_cut"]
Expand Down

0 comments on commit 2287b30

Please sign in to comment.