diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
index d494c5d95..ba9e9f6fa 100644
--- a/.github/workflows/pypi-test.yml
+++ b/.github/workflows/pypi-test.yml
@@ -19,6 +19,8 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
+      env:
+        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
       run: |
         python -m pip install --upgrade pip
         pip install deepcut tltk
diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml
index d5b54f959..8b6942b91 100644
--- a/.github/workflows/windows-test.yml
+++ b/.github/workflows/windows-test.yml
@@ -41,6 +41,8 @@ jobs:
         pip install torch==1.8.1
     - name: Install dependencies
       shell: powershell
+      env:
+        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
       run: |
         python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
         python -m pip install backports.zoneinfo[tzdata]
diff --git a/docker_requirements.txt b/docker_requirements.txt
index 72fe9e02e..4cd8b63f9 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -36,3 +36,4 @@ esupar==1.3.8
 ufal.chu-liu-edmonds==1.0.2
 wtpsplit==1.0.1
 fastcoref==2.1.6
+panphon==0.20.0
diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst
index 3c8915f24..139fadd02 100644
--- a/docs/api/soundex.rst
+++ b/docs/api/soundex.rst
@@ -12,6 +12,9 @@ Modules
 .. autofunction:: udom83
 .. autofunction:: metasound
 .. autofunction:: prayut_and_somchaip
+.. autofunction:: pythainlp.soundex.sound.word_approximation
+.. autofunction:: pythainlp.soundex.sound.audio_vector
+.. autofunction:: pythainlp.soundex.sound.word2audio
 
 References
 ----------
diff --git a/pythainlp/soundex/sound.py b/pythainlp/soundex/sound.py
new file mode 100644
index 000000000..a6629c95e
--- /dev/null
+++ b/pythainlp/soundex/sound.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+from pythainlp.transliterate import pronunciate, transliterate
+from pythainlp.tokenize import word_tokenize
+
+import panphon
+import panphon.distance
+
+_ft = panphon.FeatureTable()
+_dst = panphon.distance.Distance()
+
+# Tone letters removed by _clean_ipa, in the order they are applied.
+_TONE_MARKS = ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧")
+
+
+def _clean_ipa(ipa: str) -> str:
+    """
+    Clean IPA by removing tone marks and the space next to each "."
+
+    :param str ipa: IPA text
+    :return: IPA text without tone marks
+    :rtype: str
+    """
+    for tone in _TONE_MARKS:
+        ipa = ipa.replace(tone, "")
+    return ipa.replace(" .", ".").replace(". ", ".").strip()
+
+
+def word2audio(word: str) -> str:
+    """
+    Convert word to IPA
+
+    The word is split with word_tokenize; each token goes through
+    pronunciate (engine "w2p") and transliterate (engine "thaig2p"),
+    and the cleaned results are joined with ".".
+
+    :param str word: Thai word
+    :return: IPA text without tone marks
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.sound import word2audio
+
+        word2audio("น้ำ")
+        # output : 'n aː m .'
+    """
+    tokens = word_tokenize(word)
+    phones = [pronunciate(token, engine="w2p") for token in tokens]
+    ipa = [
+        _clean_ipa(transliterate(phone, engine="thaig2p"))
+        for phone in phones
+    ]
+    return ".".join(ipa)
+
+
+def audio_vector(word: str) -> List[List[int]]:
+    """
+    Convert a word to a list of panphon feature vectors
+
+    :param str word: Thai word
+    :return: numeric feature vectors from panphon for the IPA of the word
+    :rtype: List[List[int]]
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.sound import audio_vector
+
+        audio_vector("น้ำ")
+        # output : [[-1, 1, 1, -1, -1, -1, ...]]
+    """
+    return _ft.word_to_vector_list(word2audio(word), numeric=True)
+
+
+def word_approximation(word: str, list_word: List[str]) -> List[float]:
+    """
+    Thai Word Approximation
+
+    :param str word: Thai word
+    :param List[str] list_word: Thai words to compare with the word
+    :return: distance from the word to each item of list_word,
+        in the same order (The smaller the value, the closer)
+    :rtype: List[float]
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.sound import word_approximation
+
+        word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
+        # output : [0.0, 0.0, 3.875, 8.375]
+    """
+    target = word2audio(word)
+    return [
+        _dst.weighted_feature_edit_distance(target, word2audio(candidate))
+        for candidate in list_word
+    ]
diff --git a/setup.py b/setup.py
index 10ca6b107..e9e540732 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,9 @@
         "spacy>=3.0",
         "fastcoref>=2.1.5",
     },
+    "word_approximation":{
+        "panphon>=0.20.0"
+    },
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -146,6 +149,7 @@
         "spacy>=3.0",
         "fastcoref>=2.1.5",
         "ufal.chu-liu-edmonds>=1.0.2",
+        "panphon>=0.20.0",
     ],
 }
 
diff --git a/tests/test_soundex.py b/tests/test_soundex.py
index b012afc74..7bf00af7d 100644
--- a/tests/test_soundex.py
+++ b/tests/test_soundex.py
@@ -3,6 +3,7 @@
 import unittest
 
 from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
+from pythainlp.soundex.sound import word_approximation, audio_vector, word2audio
 
 
 class TestSoundexPackage(unittest.TestCase):
@@ -73,3 +74,13 @@ def test_soundex(self):
         self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
         self.assertIsNotNone(prayut_and_somchaip("กาง"))
         self.assertIsNotNone(prayut_and_somchaip("ว้าว"))
+
+    def test_word_approximation(self):
+        distances = word_approximation("รถ", ["รส", "รด", "คน"])
+        self.assertEqual(len(distances), 3)
+
+    def test_audio_vector(self):
+        self.assertIsNotNone(audio_vector("คน"))
+
+    def test_word2audio(self):
+        self.assertIsInstance(word2audio("คน"), str)