Merge pull request #511 from PyThaiNLP/Add-thai-word2phoneme
Add Thai W2P
wannaphong authored Jan 7, 2021
2 parents 49eee62 + d25fe08 commit 17891a3
Showing 6 changed files with 273 additions and 25 deletions.
1 change: 1 addition & 0 deletions docs/api/transliterate.rst
@@ -9,6 +9,7 @@ Modules

.. autofunction:: romanize
.. autofunction:: transliterate
.. autofunction:: pronunciate

Romanize Engines
----------------
3 changes: 2 additions & 1 deletion pythainlp/transliterate/__init__.py
@@ -6,6 +6,7 @@
__all__ = [
"romanize",
"transliterate",
"pronunciate"
]

-from pythainlp.transliterate.core import romanize, transliterate
+from pythainlp.transliterate.core import romanize, transliterate, pronunciate
70 changes: 52 additions & 18 deletions pythainlp/transliterate/core.py
@@ -2,6 +2,7 @@

DEFAULT_ROMANIZE_ENGINE = "royin"
DEFAULT_TRANSLITERATE_ENGINE = "thaig2p"
DEFAULT_PRONUNCIATE_ENGINE = "w2p"


def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
@@ -46,7 +47,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:

if engine == "thai2rom":
from .thai2rom import romanize
-else: # use default engine "royin"
+else: # use default engine: "royin"
from .royin import romanize

return romanize(text)
@@ -59,51 +60,84 @@ def transliterate(
This function transliterates Thai text.
:param str text: Thai text to be transliterated
-:param str engine: 'icu', 'ipa' (default), or 'thaig2p'
+:param str engine: 'icu', 'ipa', or 'thaig2p' (default)
:return: A string of phonetic alphabets indicating
how the input text should be pronounced.
:rtype: str
:Options for engines:
-* *icu* - International Components for Unicode (ICU)
-* *ipa* - International Phonetic Alphabet (IPA) by epitran
-* *thaig2p* - (default) Thai Grapheme to Phoneme by deep learning
-output is International Phonetic Alphabet (IPA)
-(require PyTorch)
+* *icu* - pyicu, based on International Components for Unicode (ICU)
+* *ipa* - epitran, output is International Phonetic Alphabet (IPA)
+* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
+output is IPA (requires PyTorch)
:Example:
::
from pythainlp.transliterate import transliterate
transliterate("สามารถ", engine="thaig2p")
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'
transliterate("สามารถ", engine="icu")
# output: 's̄āmārt̄h'
transliterate("สามารถ", engine="ipa")
# output: 'saːmaːrot'
transliterate("สามารถ", engine="icu")
# output: 's̄āmārt̄h'
transliterate("สามารถ", engine="thaig2p")
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'
transliterate("ภาพยนตร์", engine="thaig2p")
# output:'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
transliterate("ภาพยนตร์", engine="icu")
# output: 'p̣hāphyntr̒'
transliterate("ภาพยนตร์", engine="ipa")
# output: 'pʰaːpjanot'
transliterate("ภาพยนตร์", engine="icu")
# output: 'p̣hāphyntr̒'
transliterate("ภาพยนตร์", engine="thaig2p")
# output:'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
"""

if not text or not isinstance(text, str):
return ""

if engine == "icu" or engine == "pyicu":
from .pyicu import transliterate
-elif engine == "thaig2p":
-from .thaig2p import transliterate
-else:
+elif engine == "ipa":
from .ipa import transliterate
+else: # use default engine: "thaig2p"
+from .thaig2p import transliterate

return transliterate(text)


def pronunciate(word: str, engine: str = DEFAULT_PRONUNCIATE_ENGINE) -> str:
"""
This function pronunciates a Thai word.
:param str word: Thai word to be pronunciated
:param str engine: 'w2p' (default)
:return: A string of Thai letters indicating
how the input text should be pronounced.
:rtype: str
:Options for engines:
* *w2p* - Thai Word-to-Phoneme
:Example:
::
from pythainlp.transliterate import pronunciate
pronunciate("สามารถ", engine="w2p")
# output: 'สา-มาด'
pronunciate("ภาพยนตร์", engine="w2p")
# output: 'พาบ-พะ-ยน'
"""
if not word or not isinstance(word, str):
return ""

# if engine == "w2p": # has only one engine
from .w2p import pronunciate

return pronunciate(word)
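
The new pronunciate() wrapper follows the same lazy-import dispatch pattern as romanize() and transliterate(). A minimal usage sketch, assuming a PyThaiNLP build that includes this commit (the thai_w2p weights are downloaded on first use, and some transliterate engines need optional dependencies such as PyTorch or PyICU):

# Minimal sketch; assumes this commit is installed and models can be downloaded.
from pythainlp.transliterate import pronunciate, transliterate

word = "ภาพยนตร์"
print(pronunciate(word))                      # Thai-letter reading, e.g. 'พาบ-พะ-ยน'
print(transliterate(word, engine="thaig2p"))  # IPA output (needs PyTorch)
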
9 changes: 4 additions & 5 deletions pythainlp/transliterate/thaig2p.py
@@ -18,12 +18,11 @@


class ThaiG2P:
-def __init__(self):
-"""
-Transliteration of Thai words.
+"""
+Latin transliteration of Thai words, using International Phonetic Alphabet
+"""

-Now supports Thai to Latin (romanization)
-"""
+def __init__(self):
# get the model, will download if it's not available locally
self.__model_filename = get_corpus_path(_MODEL_NAME)

205 changes: 205 additions & 0 deletions pythainlp/transliterate/w2p.py
@@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
"""
Thai Word-to-Phoneme (Thai W2P)
GitHub : https://github.com/wannaphong/Thai_W2P
"""

import codecs
import os
import re
from typing import Union

import numpy as np
from pythainlp.corpus import download, get_corpus_path

_GRAPHEMES = list(
"พจใงต้ืฮแาฐฒฤๅูศฅถฺฎหคสุขเึดฟำฝยลอ็ม"
+ " ณิฑชฉซทรฏฬํัฃวก่ป์ผฆบี๊ธญฌษะไ๋นโภ?"
)
_PHONEMES = list(
"-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม"
+ " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?"
)

_MODEL_NAME = "thai_w2p"


class _Hparams:
batch_size = 256
enc_maxlen = 30 * 2
dec_maxlen = 40 * 2
num_epochs = 50 * 2
hidden_units = 64 * 8
emb_units = 64 * 4
graphemes = ["<pad>", "<unk>", "</s>"] + _GRAPHEMES
phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + _PHONEMES
lr = 0.001
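# Only the grapheme/phoneme vocabularies above are consumed at inference time
# in this file; the numeric values appear to record the training setup of the
# upstream Thai_W2P model.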


hp = _Hparams()


def _load_vocab():
g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
idx2g = {idx: g for idx, g in enumerate(hp.graphemes)}

p2idx = {p: idx for idx, p in enumerate(hp.phonemes)}
idx2p = {idx: p for idx, p in enumerate(hp.phonemes)}
# note that g and p mean grapheme and phoneme, respectively.
return g2idx, idx2g, p2idx, idx2p


class Thai_W2P(object):
def __init__(self):
super().__init__()
self.graphemes = hp.graphemes
self.phonemes = hp.phonemes
self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()
self.checkpoint = get_corpus_path(_MODEL_NAME)
if self.checkpoint is None:
download(_MODEL_NAME)
self.checkpoint = get_corpus_path(_MODEL_NAME)
self._load_variables()

def _load_variables(self):
self.variables = np.load(self.checkpoint, allow_pickle=True)
# (29, 64). (len(graphemes), emb)
self.enc_emb = self.variables.item().get("encoder.emb.weight")
# (3*128, 64)
self.enc_w_ih = self.variables.item().get("encoder.rnn.weight_ih_l0")
# (3*128, 128)
self.enc_w_hh = self.variables.item().get("encoder.rnn.weight_hh_l0")
# (3*128,)
self.enc_b_ih = self.variables.item().get("encoder.rnn.bias_ih_l0")
# (3*128,)
self.enc_b_hh = self.variables.item().get("encoder.rnn.bias_hh_l0")

# (74, 64). (len(phonemes), emb)
self.dec_emb = self.variables.item().get("decoder.emb.weight")
# (3*128, 64)
self.dec_w_ih = self.variables.item().get("decoder.rnn.weight_ih_l0")
# (3*128, 128)
self.dec_w_hh = self.variables.item().get("decoder.rnn.weight_hh_l0")
# (3*128,)
self.dec_b_ih = self.variables.item().get("decoder.rnn.bias_ih_l0")
# (3*128,)
self.dec_b_hh = self.variables.item().get("decoder.rnn.bias_hh_l0")
# (74, 128)
self.fc_w = self.variables.item().get("decoder.fc.weight")
# (74,)
self.fc_b = self.variables.item().get("decoder.fc.bias")

def _sigmoid(self, x):
return 1 / (1 + np.exp(-x))

def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
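# One GRU step, mirroring the gate equations of torch.nn.GRU (the parameter
# names above suggest the weights were exported from a PyTorch model):
#   r = sigmoid(W_ir x + b_ir + W_hr h + b_hr)      reset gate
#   z = sigmoid(W_iz x + b_iz + W_hz h + b_hz)      update gate
#   n = tanh(W_in x + b_in + r * (W_hn h + b_hn))   candidate state
#   h' = (1 - z) * n + z * h
# The stacked weights keep the r, z, n blocks in that order, hence the
# 2/3 : 1/3 split below.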
rzn_ih = np.matmul(x, w_ih.T) + b_ih
rzn_hh = np.matmul(h, w_hh.T) + b_hh

rz_ih, n_ih = (
rzn_ih[:, : rzn_ih.shape[-1] * 2 // 3],
rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:],
)
rz_hh, n_hh = (
rzn_hh[:, : rzn_hh.shape[-1] * 2 // 3],
rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:],
)

rz = self._sigmoid(rz_ih + rz_hh)
r, z = np.split(rz, 2, -1)

n = np.tanh(n_ih + r * n_hh)
h = (1 - z) * n + z * h

return h

def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None) -> np.ndarray:
if h0 is None:
h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
h = h0 # initial hidden state

outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
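# (batch, steps, hidden): collect the hidden state at every time step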
for t in range(steps):
h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) # (b, h)
outputs[:, t, ::] = h

return outputs

def _encode(self, word: str) -> np.ndarray:
chars = list(word) + ["</s>"]
x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
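# shape (1, len(word) + 1, emb): a batch of one word plus the trailing "</s>"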

return x

def _short_word(self, word: str) -> Union[str, None]:
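# Abbreviations ending with "." bypass the model: the dot is stripped and each
# remaining letter is read with "อ" and joined by "-", e.g. "มข." -> "มอ-ขอ".
# For any other word this returns None.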
self.word = word
if self.word.endswith("."):
self.word = self.word.replace(".", "")
self.word = "-".join([i + "อ" for i in list(self.word)])
return self.word
return None

def _predict(self, word: str) -> str:
short_word = self._short_word(word)
if short_word is not None:
return short_word

# encoder
enc = self._encode(word)
enc = self._gru(
enc,
len(word) + 1,
self.enc_w_ih,
self.enc_w_hh,
self.enc_b_ih,
self.enc_b_hh,
h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32),
)
last_hidden = enc[:, -1, :]

# decoder
dec = np.take(self.dec_emb, [2], axis=0) # 2: <s>
h = last_hidden

preds = []
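# Greedy decoding: feed the embedding of the previous prediction back in,
# for at most 20 output phonemes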
for _ in range(20):
h = self._grucell(
dec,
h,
self.dec_w_ih,
self.dec_w_hh,
self.dec_b_ih,
self.dec_b_hh,
) # (b, h)
logits = np.matmul(h, self.fc_w.T) + self.fc_b
pred = logits.argmax()
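# index 3 is "</s>" in hp.phonemes: stop at the end-of-sequence token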
if pred == 3:
break
preds.append(pred)
dec = np.take(self.dec_emb, [pred], axis=0)

preds = [self.idx2p.get(idx, "<unk>") for idx in preds]

return preds

def __call__(self, word: str) -> str:
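# Words with no known Thai graphemes (e.g. Latin text such as "jks") are
# returned unchanged; everything else goes through the seq2seq model.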
if not any(letter in word for letter in self.graphemes):
pron = [word]
else: # predict for oov
pron = self._predict(word)

return "".join(pron)


_THAI_W2P = Thai_W2P()


def pronunciate(text: str) -> str:
"""
Convert a Thai word to its pronunciation in Thai letters.
Input should be one single word.
"""
global _THAI_W2P
return _THAI_W2P(text)
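
Taken together, Thai_W2P runs a small GRU encoder-decoder entirely in NumPy: the word's graphemes are embedded and encoded, the encoder's final hidden state seeds the decoder, and phonemes are generated greedily until "</s>" or 20 steps. A short sketch of calling the module directly, assuming the "thai_w2p" model file is available through pythainlp.corpus:

# Sketch only; the "thai_w2p" weights are fetched on first use.
from pythainlp.transliterate.w2p import pronunciate

print(pronunciate("สามารถ"))  # e.g. 'สา-มาด'
print(pronunciate("มข."))     # abbreviation path: spelled out as 'มอ-ขอ'
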
10 changes: 9 additions & 1 deletion tests/test_transliterate.py
@@ -3,7 +3,7 @@
import unittest

import torch
-from pythainlp.transliterate import romanize, transliterate
+from pythainlp.transliterate import romanize, transliterate, pronunciate
from pythainlp.transliterate.ipa import trans_list, xsampa_list
from pythainlp.transliterate.thai2rom import ThaiTransliterator

@@ -134,3 +134,11 @@ def test_transliterate(self):
self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
self.assertIsNotNone(trans_list("คน"))
self.assertIsNotNone(xsampa_list("คน"))

def test_pronunciate(self):
self.assertEqual(pronunciate(""), "")
self.assertIsNotNone(pronunciate("คน", engine="w2p"))
self.assertIsNotNone(pronunciate("แมว", engine="w2p"))
self.assertIsNotNone(pronunciate("มข.", engine="w2p"))
self.assertIsNotNone(pronunciate("มช.", engine="w2p"))
self.assertIsNotNone(pronunciate("jks", engine="w2p"))
