PyThaiNLP · wannaphong · Jul 1, 2023 · Jun 21, 2023 · Jun 21, 2023 · Jun 23, 2023
diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
@@ -19,6 +19,8 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
+      env:
+        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
       run: |
         python -m pip install --upgrade pip
         pip install deepcut tltk

diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml
@@ -41,6 +41,8 @@ jobs:
         pip install torch==1.8.1
     - name: Install dependencies
       shell: powershell
+      env:
+        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
       run: |
         python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
         python -m pip install backports.zoneinfo[tzdata]

diff --git a/docker_requirements.txt b/docker_requirements.txt
@@ -36,3 +36,4 @@ esupar==1.3.8
 ufal.chu-liu-edmonds==1.0.2
 wtpsplit==1.0.1
 fastcoref==2.1.6
+panphon==0.20.0
diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst
@@ -12,6 +12,9 @@ Modules
 .. autofunction:: udom83
 .. autofunction:: metasound
 .. autofunction:: prayut_and_somchaip
+.. autofunction:: pythainlp.soundex.sound.word_approximation
+.. autofunction:: pythainlp.soundex.sound.audio_vector
+.. autofunction:: pythainlp.soundex.sound.word2audio
 
 References
 ----------

diff --git a/pythainlp/soundex/sound.py b/pythainlp/soundex/sound.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+from pythainlp.transliterate import pronunciate, transliterate
+from pythainlp.tokenize import word_tokenize
+
+import panphon
+import panphon.distance
+
+_ft = panphon.FeatureTable()
+_dst = panphon.distance.Distance()
+
+def _clean_ipa(ipa: str) -> str:
+    """Strip tone marks from IPA text and close up the spaces around dots.
+
+    :param str ipa: IPA text
+    :return: IPA text with the tone marks removed
+    :rtype: str
+    """
+    # NOTE(review): replace("˧","") appears twice in the chain; the second call is a no-op.
+    return ipa.replace("˩˩˦","").replace("˥˩","").replace("˨˩","").replace("˦˥","").replace("˧","").replace("˧","").replace(" .",".").replace(". ",".").strip()
+
+def word2audio(word: str) -> str:
+    """
+    Convert a Thai word to IPA text with the tone marks removed
+
+    :param str word: Thai word
+    :return: IPA text with the tone marks removed
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.sound import word2audio
+
+        word2audio("น้ำ")
+        # output : 'n aː m .'
+    """
+    _word = word_tokenize(word)  # tokenize first; each token is converted separately
+    _phone = [pronunciate(w, engine="w2p") for w in _word]  # pronunciation of each token
+    _ipa = [_clean_ipa(transliterate(phone, engine="thaig2p")) for phone in _phone]  # IPA, tones stripped
+    return '.'.join(_ipa)  # per-token IPA strings are joined with "."
+
+def audio_vector(word:str) -> List[List[int]]:
+    """
+    Convert a Thai word to a list of panphon feature vectors
+
+    :param str word: Thai word
+    :return: List of numeric feature vectors that panphon builds from the IPA of the word
+    :rtype: List[List[int]]
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.sound import audio_vector
+
+        audio_vector("น้ำ")
+        # output : [[-1, 1, 1, -1, -1, -1, ...]]
+    """
+    return _ft.word_to_vector_list(word2audio(word), numeric=True)
+
+def word_approximation(word:str, list_word:List[str]) -> List[float]:
+    """
+    Thai Word Approximation: distance between the IPA of a word and of each candidate word
+
+    :param str word: Thai word
+    :param List[str] list_word: List of Thai words to compare against
+    :return: Distance for each item of list_word, in the same order (the smaller the value, the closer)
+    :rtype: List[float]
+
+    :Example:
+    ::
+
+        from pythainlp.soundex.sound import word_approximation
+
+        word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
+        # output : [0.0, 0.0, 3.875, 8.375]
+    """
+    _word = word2audio(word)
+    _list_word = [word2audio(w) for w in list_word]
+    _distance = [_dst.weighted_feature_edit_distance(_word, w) for w in _list_word]
+    return _distance
diff --git a/setup.py b/setup.py
@@ -114,6 +114,9 @@
         "spacy>=3.0",
         "fastcoref>=2.1.5",
     },
+    "word_approximation":{
+        "panphon>=0.20.0"
+    },
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -146,6 +149,7 @@
         "spacy>=3.0",
         "fastcoref>=2.1.5",
         "ufal.chu-liu-edmonds>=1.0.2",
+        "panphon>=0.20.0",
     ],
 }
 

diff --git a/tests/test_soundex.py b/tests/test_soundex.py
@@ -3,6 +3,7 @@
 import unittest
 
 from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
+from pythainlp.soundex.sound import word_approximation, audio_vector
 
 
 class TestSoundexPackage(unittest.TestCase):
@@ -73,3 +74,9 @@ def test_soundex(self):
         self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
         self.assertIsNotNone(prayut_and_somchaip("กาง"))
         self.assertIsNotNone(prayut_and_somchaip("ว้าว"))
+
+    def test_word_approximation(self):
+        self.assertIsNotNone(word_approximation("รถ", ["รส","รด","คน"]))  # smoke test: only checks that a result comes back
+
+    def test_audio_vector(self):
+        self.assertIsNotNone(audio_vector("คน"))  # smoke test: only checks that a result comes back