PyThaiNLP · wannaphong · Jul 14, 2023 · Jul 11, 2023 · Jul 11, 2023
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -11,26 +11,29 @@ Modules
 .. autofunction:: bahttext
 .. autofunction:: convert_years
 .. autofunction:: collate
+.. autofunction:: count_thai_chars
+.. autofunction:: countthai
 .. autofunction:: dict_trie
 .. autofunction:: digit_to_text
 .. autofunction:: display_thai_char
 .. autofunction:: emoji_to_thai
 .. autofunction:: eng_to_thai
 .. autofunction:: find_keyword
-.. autofunction:: countthai
-.. autofunction:: count_thai_chars
+.. autofunction:: ipa_to_rtgs
 .. autofunction:: is_native_thai
 .. autofunction:: isthai
 .. autofunction:: isthaichar
+.. autofunction:: maiyamok
+.. autofunction:: nectec_to_ipa
 .. autofunction:: normalize
 .. autofunction:: now_reign_year
 .. autofunction:: num_to_thaiword
-.. autofunction:: maiyamok
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
 .. autofunction:: remove_dangling
 .. autofunction:: remove_dup_spaces
 .. autofunction:: remove_repeat_vowels
+.. autofunction:: remove_tone_ipa
 .. autofunction:: remove_tonemark
 .. autofunction:: remove_zw
 .. autofunction:: reorder_vowels
@@ -40,20 +43,19 @@ Modules
 .. autofunction:: text_to_arabic_digit
 .. autofunction:: text_to_num
 .. autofunction:: text_to_thai_digit
+.. autofunction:: thai_digit_to_arabic_digit
 .. autofunction:: thai_strftime
 .. autofunction:: thai_strptime
 .. autofunction:: thai_to_eng
 .. autofunction:: thai_word_tone_detector
-.. autofunction:: thai_digit_to_arabic_digit
 .. autofunction:: thaiword_to_date
 .. autofunction:: thaiword_to_num
 .. autofunction:: thaiword_to_time
 .. autofunction:: time_to_thaiword
 .. autofunction:: tis620_to_utf8
 .. autofunction:: tone_detector
 .. autofunction:: words_to_num
-.. autofunction:: nectec_to_ipa
-.. autofunction:: ipa_to_rtgs
-.. autofunction:: remove_tone_ipa
+.. autofunction:: pythainlp.util.spell_words.spell_syllable
+.. autofunction:: pythainlp.util.spell_words.spell_word
 .. autoclass:: Trie
    :members:
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -66,6 +66,7 @@
     "ipa_to_rtgs",
     "remove_tone_ipa",
     "tis620_to_utf8",
+    "spell_words",
 ]
 
 from pythainlp.util.collate import collate
@@ -123,3 +124,4 @@
 )
 from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
 from pythainlp.util.encoding import tis620_to_utf8
+import pythainlp.util.spell_words as spell_words
diff --git a/pythainlp/util/spell_words.py b/pythainlp/util/spell_words.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from typing import List
+from pythainlp import (
+    thai_letters,
+    thai_consonants,
+    thai_lead_vowels,
+    thai_follow_vowels,
+    thai_above_vowels,
+    thai_below_vowels,
+    thai_tonemarks
+)
+from pythainlp.tokenize import Tokenizer
+from pythainlp.tokenize import subword_tokenize
+
+
+# Vowel templates. "-" (hyphen, used in _r1) and "–" (en dash, used in _r2)
+# mark the slot of the initial consonant; ":" marks the slot where a tone
+# mark may be written.
+_r1 = ["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"]
+_r2 = ["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"]
+# Tone mark character -> its spelled-out name.
+tonemarks = {
+    mark: "ไม้" + name
+    for mark, name in zip(thai_tonemarks, ["เอก", "โท", "ตรี", "จัตวา"])
+}
+
+# Regex fragments substituted into the templates above.
+_letter = f"([{thai_letters}])"
+_tone = f"([{thai_tonemarks}])"
+# One letter optionally followed by one tone mark, captured together.
+# The previous pattern embedded the literal text "thai_tonemarks" here, so
+# rule1 could never match a syllable that carries a tone mark.
+_letter_opt_tone = f"([{thai_letters}][{thai_tonemarks}]?)"
+
+
+def _vowel_form(template):
+    """Return *template* written with "อ" in the consonant slot and no ":"."""
+    return template.replace("-", "อ").replace("–", "อ").replace(":", "")
+
+
+# rule1: _r1 vowels, tone mark optional.
+# rule2: _r2 vowels without a tone mark.
+# rule3: _r2 vowels with a tone mark in the ":" slot.
+rule1 = [t.replace("-", _letter_opt_tone) for t in _r1]
+rule2 = [t.replace("–", _letter).replace(":", "") for t in _r2]
+rule3 = [t.replace("–", _letter).replace(":", _tone) for t in _r2]
+# Placeholder ("อ") spelling of each template, index-aligned with the rules,
+# so _clean does not have to reverse-engineer it from the regex text.
+_form1 = [_vowel_form(t) for t in _r1]
+_form2 = [_vowel_form(t) for t in _r2]
+
+# Compound vowel forms map to themselves.
+dict_vowel_ex = {form: form for form in _form1 + _form2}
+# Every known vowel -> its spelling with the placeholder consonant "อ".
+dict_vowel = dict(dict_vowel_ex)
+dict_vowel.update({v: v + "อ" for v in thai_lead_vowels})
+dict_vowel.update(
+    {
+        v: "อ" + v
+        for v in thai_follow_vowels + thai_above_vowels + thai_below_vowels
+    }
+)
+
+# Dictionary tokenizer whose vocabulary is the vowel forms plus consonants.
+_cut = Tokenizer(list(dict_vowel.keys()) + list(thai_consonants), engine="mm")
+
+
+def _clean(w):
+    """
+    Rewrite a syllable so that a compound vowel becomes one contiguous unit.
+
+    If the start of *w* matches one of the vowel rules, the text captured in
+    the consonant slot is moved to the front, followed by whatever comes
+    after the matched vowel, then the vowel in its placeholder ("อ") form
+    and finally the tone mark, if one was captured separately. A syllable
+    that matches no rule is returned unchanged.
+
+    :param str w: Thai syllable
+    :return: the rearranged syllable, ready to be split by ``_cut``
+    :rtype: str
+    """
+    if re.match('|'.join(rule3), w):
+        for pattern, form in zip(rule3, _form2):
+            m = re.match(pattern, w)
+            if m:
+                # The "เ–ือะ" template has no ":" slot, so its pattern has a
+                # single group; referencing group 2 there would raise.
+                tone = m.group(2) if len(m.groups()) > 1 else ""
+                # Keep the text after the vowel (e.g. a final consonant)
+                # instead of dropping it.
+                w = m.group(1) + w[m.end():] + form + tone
+    elif re.match('|'.join(rule2), w):
+        for pattern, form in zip(rule2, _form2):
+            m = re.match(pattern, w)
+            if m:
+                w = m.group(1) + w[m.end():] + form
+    elif re.match('|'.join(rule1), w):
+        for pattern, form in zip(rule1, _form1):
+            m = re.match(pattern, w)
+            if m:
+                w = m.group(1) + w[m.end():] + form
+    return w
+
+
+def spell_syllable(s: str)-> List[str]:
+    """
+    Spell out a single Thai syllable.
+
+    The syllable is normalised with ``_clean`` and split with ``_cut``. The
+    result lists every consonant (suffixed with "อ") first, then every vowel
+    in its ``dict_vowel`` form, then the name of every tone mark, and ends
+    with the original syllable itself.
+
+    :param str s: Thai syllable only
+    :return: List of spelled-out parts followed by the syllable
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util.spell_words import spell_syllable
+
+        print(spell_syllable("แมว"))
+        # output: ['มอ', 'วอ', 'แอ', 'แมว']
+    """
+    consonant_set = set(thai_consonants)
+    consonant_parts = []
+    vowel_parts = []
+    tone_parts = []
+    for token in _cut.word_tokenize(_clean(s)):
+        # Independent checks: a token is filed under every category it
+        # belongs to, not only the first one.
+        if token in consonant_set:
+            consonant_parts.append(token + "อ")
+        if token in dict_vowel:
+            vowel_parts.append(dict_vowel[token])
+        if token in tonemarks:
+            tone_parts.append(tonemarks[token])
+    return [*consonant_parts, *vowel_parts, *tone_parts, s]
+
+
+def spell_word(w: str)-> List[str]:
+    """
+    Spell out a Thai word syllable by syllable.
+
+    The word is split into syllables with ``subword_tokenize`` (engine
+    "ssg") and each syllable is spelled with ``spell_syllable``. When the
+    word has more than one syllable, the whole word is appended at the end.
+
+    :param str w: Thai word only
+    :return: List of spelled-out parts
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util.spell_words import spell_word
+
+        print(spell_word("คนดี"))
+        # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
+    """
+    syllables = subword_tokenize(w, engine="ssg")
+    spelled = [
+        part
+        for syllable in syllables
+        for part in spell_syllable(syllable)
+    ]
+    # A single-syllable word is already the last item of its own spelling.
+    if len(syllables) > 1:
+        spelled.append(w)
+    return spelled
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -59,6 +59,7 @@
     remove_tone_ipa,
     tis620_to_utf8,
 )
+from pythainlp.util.spell_words import spell_word
 
 
 class TestUtilPackage(unittest.TestCase):
@@ -844,3 +845,9 @@ def test_remove_tone_ipa(self):
 
     def test_tis620_to_utf8(self):
         self.assertEqual(tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
+
+    def test_spell_word(self):
+        # Word -> expected spelling; checked in insertion order.
+        expected = {
+            "เสือ": ['สอ', 'เอือ', 'เสือ'],
+            "เสื้อ": ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'],
+            "คน": ['คอ', 'นอ', 'คน'],
+            "คนดี": ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'],
+        }
+        for word, spelling in expected.items():
+            self.assertEqual(spell_word(word), spelling)