From de5d0fa4ac487f5ee6da9b27c470a7f4d29aa4ad Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 11 Jul 2023 21:30:36 +0700
Subject: [PATCH 1/2] Add pythainlp.util.spell_words

---
Review notes (text between the three-dash line and the first diff header
is not part of the commit message):

 * rule1 interpolated the tone-mark set as the literal text
   "(thai_tonemarks)?", so the optional group could never match a tone
   mark. It is now the character class "[{thai_tonemarks}]?", and the
   matching r.replace(...) call in _clean() was updated in step so the
   vowel template is still recovered from the pattern.
 * rule2 used f"" (an f-string without placeholders); it is now "".
 * spell_words.py now ends with a newline.
 * All edits are in-place line modifications, so hunk sizes and the
   diffstat are unchanged. The blob id on the spell_words.py index line
   was NOT recomputed after these edits.

 docs/api/util.rst             |   2 +
 pythainlp/util/__init__.py    |   2 +
 pythainlp/util/spell_words.py | 121 ++++++++++++++++++++++++++++++++++
 tests/test_util.py            |   7 ++
 4 files changed, 132 insertions(+)
 create mode 100644 pythainlp/util/spell_words.py

diff --git a/docs/api/util.rst b/docs/api/util.rst
index 491c7579e..5ee2addd9 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -55,5 +55,7 @@ Modules
 .. autofunction:: nectec_to_ipa
 .. autofunction:: ipa_to_rtgs
 .. autofunction:: remove_tone_ipa
+.. autofunction:: pythainlp.util.spell_words.spell_word
+.. autofunction:: pythainlp.util.spell_words.spell_syllable
 .. autoclass:: Trie
     :members:
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index c468251ac..fafcf0459 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -66,6 +66,7 @@
     "ipa_to_rtgs",
     "remove_tone_ipa",
     "tis620_to_utf8",
+    "spell_words",
 ]
 
 from pythainlp.util.collate import collate
@@ -123,3 +124,4 @@
 )
 from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
 from pythainlp.util.encoding import tis620_to_utf8
+import pythainlp.util.spell_words as spell_words
diff --git a/pythainlp/util/spell_words.py b/pythainlp/util/spell_words.py
new file mode 100644
index 000000000..9344df9f4
--- /dev/null
+++ b/pythainlp/util/spell_words.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from typing import List
+from pythainlp import (
+    thai_letters,
+    thai_consonants,
+    thai_lead_vowels,
+    thai_follow_vowels,
+    thai_above_vowels,
+    thai_below_vowels,
+    thai_tonemarks
+)
+from pythainlp.tokenize import Tokenizer
+from pythainlp.tokenize import subword_tokenize
+
+
+_r1=["เ-ย","เ-ะ","แ-ะ","โ-ะ","เ-าะ","เ-อะ","เ-อ","เ-า"]
+_r2=["–ั:วะ","เ–ี:ยะ","เ–ือะ","–ั:ว","เ–ี:ย","เ–ื:อ","–ื:อ"]
+tonemarks={i:"ไม้"+j for i,j in zip(list(thai_tonemarks),["เอก","โท","ตรี","จัตวา"])}
+
+rule1=[i.replace("-",f"([{thai_letters}][{thai_tonemarks}]?)") for i in _r1]
+rule2=[i.replace("–",f"([{thai_letters}])").replace(":","") for i in _r2]
+rule3=[i.replace("–",f"([{thai_letters}])").replace(":",f"([{thai_tonemarks}])") for i in _r2]
+dict_vowel_ex={}
+for i in _r1+_r2:
+    dict_vowel_ex[i.replace("-","อ").replace("–","อ").replace(":","")]=i.replace("-","อ").replace(":","").replace("–","อ")
+dict_vowel={}
+for i in _r1+_r2:
+    dict_vowel[i.replace("-","อ").replace("–","อ").replace(":","")]=i.replace("-","อ").replace(":","").replace("–","อ")
+for i in thai_lead_vowels:
+    dict_vowel[i]=i+"อ"
+for i in thai_follow_vowels:
+    dict_vowel[i]="อ"+i
+for i in thai_above_vowels:
+    dict_vowel[i]="อ"+i
+for i in thai_below_vowels:
+    dict_vowel[i]="อ"+i
+
+_cut=Tokenizer(list(dict_vowel.keys())+list(thai_consonants),engine="mm")
+
+
+def _clean(w):
+    if bool(re.match('|'.join(rule3),w)):
+        for r in rule3:
+            if bool(re.match(r,w)):
+                _w=re.sub(r,"\\1==\\2==",w)
+                _temp=_w.split("==")
+                w=_temp[0]+r.replace(f"([{thai_letters}])","อ").replace(f"([{thai_tonemarks}])","")+_temp[1]
+    elif bool(re.match('|'.join(rule2),w)):
+        for r in rule2:
+            if bool(re.match(r,w)):
+                w=re.sub(r,"\\1",w)+r.replace(f"([{thai_letters}])","อ")
+    elif bool(re.match('|'.join(rule1),w)):
+        for r in rule1:
+            if bool(re.match(r,w)):
+                w=re.sub(r,"\\1",w)+r.replace(f"([{thai_letters}][{thai_tonemarks}]?)","อ")
+    return w
+
+
+def spell_syllable(s: str)-> List[str]:
+    """
+    Spell syllable by Thai word distribution form.
+
+    :param str s: Thai syllable only
+    :return: List of spell syllable
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util.spell_words import spell_syllable
+
+        print(spell_syllable("แมว"))
+        # output: ['มอ', 'วอ', 'แอ', 'แมว']
+    """
+    _t=s
+    s=_cut.word_tokenize(_clean(s))
+    _c_only = [i+"อ" for i in s if i in set(thai_consonants)]
+    _v_only = [dict_vowel[i] for i in s if i in set(dict_vowel.keys())]
+    _t_only = [tonemarks[i] for i in s if i in set(tonemarks.keys())]
+    _out=_c_only+_v_only+_t_only
+    _out.append(_t)
+    return _out
+
+
+def spell_word(w: str)-> List[str]:
+    """
+    Spell word by Thai word distribution form.
+
+    :param str w: Thai word only
+    :return: List of spell word
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util.spell_words import spell_word
+
+        print(spell_word("คนดี"))
+        # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
+    """
+    _r=[]
+    _temp=subword_tokenize(w,engine="ssg")
+    for i in _temp:
+        _r.extend(spell_syllable(i))
+    if len(_temp)>1:
+        _r.append(w)
+    return _r
diff --git a/tests/test_util.py b/tests/test_util.py
index 34d12c06a..9b6ae336b 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -59,6 +59,7 @@
     remove_tone_ipa,
     tis620_to_utf8,
 )
+from pythainlp.util.spell_words import spell_word
 
 
 class TestUtilPackage(unittest.TestCase):
@@ -844,3 +845,9 @@ def test_remove_tone_ipa(self):
 
     def test_tis620_to_utf8(self):
         self.assertEqual(tis620_to_utf8("¡ÃзÃǧÍصÊÒË¡ÃÃÁ"), "กระทรวงอุตสาหกรรม")
+
+    def test_spell_word(self):
+        self.assertEqual(spell_word("เสือ"),['สอ', 'เอือ', 'เสือ'])
+        self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ'])
+        self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน'])
+        self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี'])

From 4109d687617e4a53a19e1a134a48de4a533f8420 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 11 Jul 2023 16:34:08 +0100
Subject: [PATCH 2/2] Sort autofunction

---
 docs/api/util.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/api/util.rst b/docs/api/util.rst
index 5ee2addd9..4023218ca 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -11,26 +11,29 @@ Modules
 .. autofunction:: bahttext
 .. autofunction:: convert_years
 .. autofunction:: collate
+.. autofunction:: count_thai_chars
+.. autofunction:: countthai
 .. autofunction:: dict_trie
 .. autofunction:: digit_to_text
 .. autofunction:: display_thai_char
 .. autofunction:: emoji_to_thai
 .. autofunction:: eng_to_thai
 .. autofunction:: find_keyword
-.. autofunction:: countthai
-.. autofunction:: count_thai_chars
+.. autofunction:: ipa_to_rtgs
 .. autofunction:: is_native_thai
 .. autofunction:: isthai
 .. autofunction:: isthaichar
+.. autofunction:: maiyamok
+.. autofunction:: nectec_to_ipa
 .. autofunction:: normalize
 .. autofunction:: now_reign_year
 .. autofunction:: num_to_thaiword
-.. autofunction:: maiyamok
 .. autofunction:: rank
 .. autofunction:: reign_year_to_ad
 .. autofunction:: remove_dangling
 .. autofunction:: remove_dup_spaces
 .. autofunction:: remove_repeat_vowels
+.. autofunction:: remove_tone_ipa
 .. autofunction:: remove_tonemark
 .. autofunction:: remove_zw
 .. autofunction:: reorder_vowels
@@ -40,11 +43,11 @@ Modules
 .. autofunction:: text_to_arabic_digit
 .. autofunction:: text_to_num
 .. autofunction:: text_to_thai_digit
+.. autofunction:: thai_digit_to_arabic_digit
 .. autofunction:: thai_strftime
 .. autofunction:: thai_strptime
 .. autofunction:: thai_to_eng
 .. autofunction:: thai_word_tone_detector
-.. autofunction:: thai_digit_to_arabic_digit
 .. autofunction:: thaiword_to_date
 .. autofunction:: thaiword_to_num
 .. autofunction:: thaiword_to_time
@@ -52,10 +55,7 @@ Modules
 .. autofunction:: tis620_to_utf8
 .. autofunction:: tone_detector
 .. autofunction:: words_to_num
-.. autofunction:: nectec_to_ipa
-.. autofunction:: ipa_to_rtgs
-.. autofunction:: remove_tone_ipa
-.. autofunction:: pythainlp.util.spell_words.spell_word
 .. autofunction:: pythainlp.util.spell_words.spell_syllable
+.. autofunction:: pythainlp.util.spell_words.spell_word
 .. autoclass:: Trie
     :members: