
Commit

Merge pull request #927 from ayaan-qadri/dev
Added list of string support to sent_tokenize
wannaphong authored Oct 28, 2024
2 parents 2875544 + 1a2b457 commit b11fe00
Showing 2 changed files with 117 additions and 15 deletions.
115 changes: 100 additions & 15 deletions pythainlp/tokenize/core.py
@@ -6,6 +6,7 @@
"""
import re
from typing import Iterable, List, Union
import copy

from pythainlp.tokenize import (
    DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -198,7 +199,7 @@ def word_tokenize(
        word_tokenize(text, engine="newmm", keep_whitespace=False)
        # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
    Join broken formatted numeric (e.g. time, decimals, IP addresses)::
        text = "เงิน1,234บาท19:32น 127.0.0.1"
@@ -322,17 +323,50 @@ def word_tokenize(
    return segments


def indices_words(words):
    indices = []
    start_index = 0
    for word in words:
        end_index = start_index + len(word) - 1
        indices.append((start_index, end_index))
        start_index += len(word)

    return indices
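
# Illustrative note, not part of the original diff: indices_words() maps each
# word of a pre-tokenized list to its (start, end) character span in the
# joined text, with `end` inclusive. A minimal sketch of the expected
# behaviour, assuming the helper as defined above:
#
#     indices_words(["ผม", "กิน", "ข้าว"])
#     # -> [(0, 1), (2, 4), (5, 8)]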


def map_indices_to_words(index_list, sentences):
    result = []
    c = copy.copy(index_list)
    n_sum = 0
    for sentence in sentences:
        words = sentence
        sentence_result = []
        n = 0
        for start, end in c:
            if start > n_sum + len(words) - 1:
                break
            else:
                word = sentence[start - n_sum:end + 1 - n_sum]
                sentence_result.append(word)
                n += 1

        result.append(sentence_result)
        n_sum += len(words)
        for _ in range(n):
            del c[0]
    return result
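
# Illustrative note, not part of the original diff: map_indices_to_words()
# regroups the original word tokens into one sub-list per segment by walking
# the character spans produced by indices_words() across segment boundaries.
# A minimal sketch, assuming both helpers as defined above:
#
#     spans = indices_words(["ผม", "กิน", "ข้าว"])   # [(0, 1), (2, 4), (5, 8)]
#     map_indices_to_words(spans, ["ผมกิน", "ข้าว"])
#     # -> [['ผม', 'กิน'], ['ข้าว']]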

def sent_tokenize(
    text: str,
    text: Union[str, List[str]],
    engine: str = DEFAULT_SENT_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Sentence tokenizer.
    Tokenizes running text into "sentences"
    Tokenizes running text into "sentences". Supports both string and list of strings.
    :param str text: the text to be tokenized
    :param text: the text (string) or list of words (list of strings) to be tokenized
    :param str engine: choose among *'crfcut'*, *'whitespace'*, \
        *'whitespace+newline'*
    :return: list of split sentences
@@ -394,38 +428,84 @@ def sent_tokenize(
        'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค']
    """

    if not text or not isinstance(text, str):
    if not text or not isinstance(text, (str, list)):
        return []

    is_list_input = isinstance(text, list)

    if is_list_input:

        try:
            original_text = "".join(text)
        except ValueError:
            return []

    else:
        original_text = text

    segments = []

    if engine == "crfcut":
        from pythainlp.tokenize.crfcut import segment

        segments = segment(text)
        segments = segment(original_text)

        if is_list_input:
            word_indices = indices_words(text)
            result = map_indices_to_words(word_indices, [original_text])
            return result
    elif engine == "whitespace":
        segments = re.split(r" +", text, flags=re.U)
        segments = re.split(r" +", original_text, flags=re.U)
        if is_list_input:
            result = []
            _temp = []
            for i, w in enumerate(text):
                if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
                    if _temp == []:
                        continue
                    result.append(_temp)
                    _temp = []
                else:
                    _temp.append(w)
                if i + 1 == len(text):
                    result.append(_temp)
            return result
    elif engine == "whitespace+newline":
        segments = text.split()
        segments = original_text.split()
        if is_list_input:
            result = []
            _temp = []
            for i, w in enumerate(text):
                if (
                    (re.findall(r"\s", w) != [] or
                    re.findall(r"\n", w) != []) and
                    re.findall(r"\w", w) == []
                ):
                    if _temp == []:
                        continue
                    result.append(_temp)
                    _temp = []
                else:
                    _temp.append(w)
                if i + 1 == len(text):
                    result.append(_temp)
            return result
    elif engine == "tltk":
        from pythainlp.tokenize.tltk import sent_tokenize as segment

        segments = segment(text)
        segments = segment(original_text)
    elif engine == "thaisum":
        from pythainlp.tokenize.thaisumcut import (
            ThaiSentenceSegmentor as segmentor,
        )

        segment = segmentor()
        segments = segment.split_into_sentences(text)
        segments = segment.split_into_sentences(original_text)
    elif engine.startswith("wtp"):
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment

        segments = segment(text, size=_size, tokenize="sentence")
        segments = segment(original_text, size=_size, tokenize="sentence")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
@@ -435,7 +515,12 @@ def sent_tokenize(
    if not keep_whitespace:
        segments = strip_whitespace(segments)

    return segments
    if is_list_input and engine not in ["crfcut"]:
        word_indices = indices_words(text)
        result = map_indices_to_words(word_indices, segments)
        return result
    else:
        return [segments]


def paragraph_tokenize(
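
The sketch below is an illustrative usage example and is not part of the diff; it shows how the new list-of-strings input is expected to behave with the "whitespace" engine, mirroring the test case added in tests/test_tokenize.py further down.

    from pythainlp.tokenize import sent_tokenize

    # A pre-tokenized input: word tokens plus a space token and a newline token.
    words = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]

    # With list input, each detected sentence comes back as a sub-list of the
    # original word tokens (expected output taken from the added tests):
    sent_tokenize(words, engine="whitespace")
    # -> [["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]]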
17 changes: 17 additions & 0 deletions tests/test_tokenize.py
@@ -333,6 +333,23 @@ def test_sent_tokenize(self):
        #         engine="wtp-large",
        #     ),
        # )
        sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
        self.assertEqual(
            sent_tokenize(sent_4, engine="crfcut"),
            [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
        )
        self.assertEqual(
            sent_tokenize(sent_4, engine="whitespace"),
            [["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]],
        )
        self.assertEqual(
            sent_tokenize(sent_4, engine="whitespace+newline"),
            [["ผม", "กิน", "ข้าว"], ["เธอ", "เล่น", "เกม"]],
        )
        self.assertEqual(
            sent_tokenize(sent_4, engine="thaisum"),
            [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]],
        )
        self.assertFalse(
            " "
            in sent_tokenize(
