Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add word approximation to pythainlp.soundex.sound #809

Merged
merged 3 commits into from
Jul 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/pypi-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
env:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: |
python -m pip install --upgrade pip
pip install deepcut tltk
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/windows-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ jobs:
pip install torch==1.8.1
- name: Install dependencies
shell: powershell
env:
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL: True
run: |
python -m pip install --disable-pip-version-check --user --upgrade pip setuptools
python -m pip install backports.zoneinfo[tzdata]
Expand Down
1 change: 1 addition & 0 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
wtpsplit==1.0.1
fastcoref==2.1.6
panphon==0.20.0
3 changes: 3 additions & 0 deletions docs/api/soundex.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Modules
.. autofunction:: udom83
.. autofunction:: metasound
.. autofunction:: prayut_and_somchaip
.. autofunction:: pythainlp.soundex.sound.word_approximation
.. autofunction:: pythainlp.soundex.sound.audio_vector
.. autofunction:: pythainlp.soundex.sound.word2audio

References
----------
Expand Down
94 changes: 94 additions & 0 deletions pythainlp/soundex/sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from pythainlp.transliterate import pronunciate, transliterate
from pythainlp.tokenize import word_tokenize

import panphon
import panphon.distance

# Shared panphon helpers, created once when this module is imported.
# _ft turns an IPA string into a list of feature vectors (used by audio_vector).
_ft = panphon.FeatureTable()
# _dst computes the weighted feature edit distance between two IPA strings
# (used by word_approximation).
_dst = panphon.distance.Distance()

def _clean_ipa(ipa: str) -> str:
    """
    Clean IPA by removing tone letters and the spaces around syllable dots

    Only spaces directly before or after a "." are removed; spaces between
    other symbols are kept. Leading and trailing whitespace is stripped.

    :param str ipa: IPA text
    :return: IPA with tone letters removed
    :rtype: str
    """
    # The sequences share characters, so they are removed in this fixed
    # order (longest contour first) to keep the result well defined.
    for tone in ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧"):
        ipa = ipa.replace(tone, "")
    return ipa.replace(" .", ".").replace(". ", ".").strip()

def word2audio(word: str) -> str:
    """
    Convert a Thai word to IPA with the tone marks removed

    The word is tokenized, each token is turned into its pronunciation
    with the "w2p" engine, and each pronunciation is transliterated to IPA
    with the "thaig2p" engine. The cleaned IPA pieces are joined with ".".

    :param str word: Thai word
    :return: IPA text with the tone removed
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.sound import word2audio

        word2audio("น้ำ")
        # output : 'n aː m .'
    """
    tokens = word_tokenize(word)
    readings = [pronunciate(token, engine="w2p") for token in tokens]
    return '.'.join(
        _clean_ipa(transliterate(reading, engine="thaig2p"))
        for reading in readings
    )

def audio_vector(word:str) -> List[List[int]]:
    """
    Convert a Thai word to a list of panphon feature vectors

    The word is first converted to IPA with word2audio, then passed to the
    panphon feature table with ``numeric=True``.

    :param str word: Thai word
    :return: List of feature vectors from panphon
    :rtype: List[List[int]]

    :Example:
    ::

        from pythainlp.soundex.sound import audio_vector

        audio_vector("น้ำ")
        # output : [[-1, 1, 1, -1, -1, -1, ...]]
    """
    ipa = word2audio(word)
    vectors = _ft.word_to_vector_list(ipa, numeric=True)
    return vectors

def word_approximation(word: str, list_word: List[str]) -> List[float]:
    """
    Thai Word Approximation

    Convert ``word`` and every word in ``list_word`` to IPA with word2audio
    and measure the panphon weighted feature edit distance between them.

    :param str word: Thai word
    :param List[str] list_word: Thai words to compare against ``word``
    :return: Distance for each word in ``list_word``, in the same order
             (The smaller the value, the closer)
    :rtype: List[float]

    :Example:
    ::

        from pythainlp.soundex.sound import word_approximation

        word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
        # output : [0.0, 0.0, 3.875, 8.375]
    """
    # Convert the reference word once; it is reused for every comparison.
    _word = word2audio(word)
    return [
        _dst.weighted_feature_edit_distance(_word, word2audio(w))
        for w in list_word
    ]
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@
"spacy>=3.0",
"fastcoref>=2.1.5",
},
"word_approximation":{
"panphon>=0.20.0"
},
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
Expand Down Expand Up @@ -146,6 +149,7 @@
"spacy>=3.0",
"fastcoref>=2.1.5",
"ufal.chu-liu-edmonds>=1.0.2",
"panphon>=0.20.0",
],
}

Expand Down
7 changes: 7 additions & 0 deletions tests/test_soundex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest

from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
from pythainlp.soundex.sound import word_approximation, audio_vector


class TestSoundexPackage(unittest.TestCase):
Expand Down Expand Up @@ -73,3 +74,9 @@ def test_soundex(self):
self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
self.assertIsNotNone(prayut_and_somchaip("กาง"))
self.assertIsNotNone(prayut_and_somchaip("ว้าว"))

def test_word_approximation(self):
    """word_approximation returns a result for a word and its candidates."""
    candidates = ["รส","รด","คน"]
    distances = word_approximation("รถ", candidates)
    self.assertIsNotNone(distances)

def test_audio_vector(self):
    """audio_vector returns a result for a Thai word."""
    vectors = audio_vector("คน")
    self.assertIsNotNone(vectors)