Skip to content

Commit

Permalink
Merge pull request #809 from PyThaiNLP/add-word_approximation
Browse files Browse the repository at this point in the history
Add word approximation to pythainlp.soundex.sound
  • Loading branch information
wannaphong authored Jul 1, 2023
2 parents 9d0e030 + 2313cb0 commit ee3beda
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/pypi-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
env:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: |
python -m pip install --upgrade pip
pip install deepcut tltk
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/windows-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ jobs:
pip install torch==1.8.1
- name: Install dependencies
shell: powershell
env:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: |
python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
python -m pip install backports.zoneinfo[tzdata]
Expand Down
1 change: 1 addition & 0 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
wtpsplit==1.0.1
fastcoref==2.1.6
panphon==0.20.0
3 changes: 3 additions & 0 deletions docs/api/soundex.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Modules
.. autofunction:: udom83
.. autofunction:: metasound
.. autofunction:: prayut_and_somchaip
.. autofunction:: pythainlp.soundex.sound.word_approximation
.. autofunction:: pythainlp.soundex.sound.audio_vector
.. autofunction:: pythainlp.soundex.sound.word2audio

References
----------
Expand Down
94 changes: 94 additions & 0 deletions pythainlp/soundex/sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from pythainlp.transliterate import pronunciate, transliterate
from pythainlp.tokenize import word_tokenize

import panphon
import panphon.distance

# Shared panphon objects, constructed once when this module is imported.
# _ft is used by audio_vector() to turn IPA text into feature vectors;
# _dst is used by word_approximation() to measure the distance between
# two IPA strings.
_ft = panphon.FeatureTable()
_dst = panphon.distance.Distance()

def _clean_ipa(ipa: str) -> str:
    """
    Remove tone marks from IPA text and tidy the spacing around dots.

    :param str ipa: IPA text
    :return: the IPA text with the tone marks removed, one space dropped
        directly before and directly after each ".", and leading and
        trailing whitespace stripped
    :rtype: str
    """
    # Substrings are removed one after another, in exactly this order.
    tone_marks = ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧")
    cleaned = ipa
    for mark in tone_marks:
        cleaned = cleaned.replace(mark, "")
    # Drop the space before a dot first, then the space after it.
    for spaced_dot in (" .", ". "):
        cleaned = cleaned.replace(spaced_dot, ".")
    return cleaned.strip()

def word2audio(word: str) -> str:
    """
    Convert word to IPA

    The text is split with word_tokenize(); every token is passed through
    pronunciate() (engine "w2p") and then transliterate() (engine
    "thaig2p"). Each result is cleaned with _clean_ipa() and the pieces
    are joined with ".".

    :param str word: Thai word
    :return: IPA that remove tone from the text
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.sound import word2audio

        word2audio("น้ำ")
        # output : 'n aː m .'
    """
    # First pass: run pronunciate() over every token.
    respellings = []
    for token in word_tokenize(word):
        respellings.append(pronunciate(token, engine="w2p"))

    # Second pass: transliterate each result to IPA and clean it up.
    ipa_parts = []
    for respelling in respellings:
        raw_ipa = transliterate(respelling, engine="thaig2p")
        ipa_parts.append(_clean_ipa(raw_ipa))

    return '.'.join(ipa_parts)

def audio_vector(word: str) -> List[List[int]]:
    """
    Convert audio to vector list

    The word is first converted to IPA with word2audio(); that IPA text
    is then handed to the panphon feature table's word_to_vector_list()
    with numeric=True.

    :param str word: Thai word
    :return: List feature from panphon
    :rtype: List[List[int]]

    :Example:
    ::

        from pythainlp.soundex.sound import audio_vector

        audio_vector("น้ำ")
        # output : [[-1, 1, 1, -1, -1, -1, ...]]
    """
    ipa_text = word2audio(word)
    feature_vectors = _ft.word_to_vector_list(ipa_text, numeric=True)
    return feature_vectors

def word_approximation(word: str, list_word: List[str]) -> List[float]:
    """
    Thai Word Approximation

    Both the query word and every candidate are converted to IPA with
    word2audio(); the distance between the query and each candidate is
    then computed with panphon's weighted feature edit distance.

    :param str word: Thai word
    :param List[str] list_word: Thai words to compare against ``word``
    :return: one distance per item of ``list_word``, in the same order
        (The smaller the value, the closer)
    :rtype: List[float]

    :Example:
    ::

        from pythainlp.soundex.sound import word_approximation

        word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
        # output : [0.0, 0.0, 3.875, 8.375]
    """
    # Convert the query once; it is reused for every candidate.
    query_ipa = word2audio(word)
    return [
        _dst.weighted_feature_edit_distance(query_ipa, word2audio(candidate))
        for candidate in list_word
    ]
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@
"spacy>=3.0",
"fastcoref>=2.1.5",
},
"word_approximation":{
"panphon>=0.20.0"
},
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
Expand Down Expand Up @@ -146,6 +149,7 @@
"spacy>=3.0",
"fastcoref>=2.1.5",
"ufal.chu-liu-edmonds>=1.0.2",
"panphon>=0.20.0",
],
}

Expand Down
7 changes: 7 additions & 0 deletions tests/test_soundex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest

from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
from pythainlp.soundex.sound import word_approximation, audio_vector


class TestSoundexPackage(unittest.TestCase):
Expand Down Expand Up @@ -73,3 +74,9 @@ def test_soundex(self):
self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
self.assertIsNotNone(prayut_and_somchaip("กาง"))
self.assertIsNotNone(prayut_and_somchaip("ว้าว"))

def test_word_approximation(self):
    """word_approximation() returns a result (not None) for Thai input."""
    candidates = ["รส","รด","คน"]
    distances = word_approximation("รถ", candidates)
    self.assertIsNotNone(distances)

def test_audio_vector(self):
    """audio_vector() returns a result (not None) for a Thai word."""
    vectors = audio_vector("คน")
    self.assertIsNotNone(vectors)

0 comments on commit ee3beda

Please sign in to comment.