Merge pull request #511 from PyThaiNLP/Add-thai-word2phoneme

Add Thai W2P
PyThaiNLP · Jan 7, 2021 · 17891a3 · 17891a3
2 parents 49eee62 + d25fe08
commit 17891a3
Show file tree

Hide file tree

Showing 6 changed files with 273 additions and 25 deletions.
diff --git a/docs/api/transliterate.rst b/docs/api/transliterate.rst
@@ -9,6 +9,7 @@ Modules
 
 .. autofunction:: romanize
 .. autofunction:: transliterate
+.. autofunction:: pronunciate
 
 Romanize Engines
 ----------------

diff --git a/pythainlp/transliterate/__init__.py b/pythainlp/transliterate/__init__.py
@@ -6,6 +6,7 @@
 __all__ = [
     "romanize",
     "transliterate",
+    "pronunciate"
 ]
 
-from pythainlp.transliterate.core import romanize, transliterate
+from pythainlp.transliterate.core import romanize, transliterate, pronunciate
diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py
@@ -2,6 +2,7 @@
 
 DEFAULT_ROMANIZE_ENGINE = "royin"
 DEFAULT_TRANSLITERATE_ENGINE = "thaig2p"
+DEFAULT_PRONUNCIATE_ENGINE = "w2p"
 
 
 def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
@@ -46,7 +47,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
 
     if engine == "thai2rom":
         from .thai2rom import romanize
-    else:  # use default engine "royin"
+    else:  # use default engine: "royin"
         from .royin import romanize
 
     return romanize(text)
@@ -59,51 +60,84 @@ def transliterate(
     This function transliterates Thai text.
 
     :param str text: Thai text to be transliterated
-    :param str engine: 'icu', 'ipa' (default), or 'thaig2p'
+    :param str engine: 'icu', 'ipa', or 'thaig2p' (default)
 
     :return: A string of phonetic alphabets indicating
              how the input text should be pronounced.
     :rtype: str
 
     :Options for engines:
-        * *icu* - International Components for Unicode (ICU)
-        * *ipa* - International Phonetic Alphabet (IPA) by epitran
-        * *thaig2p* - (default) Thai Grapheme to Phoneme by deep learning
-          output is International Phonetic Alphabet (IPA)
-          (require PyTorch)
+        * *icu* - pyicu, based on International Components for Unicode (ICU)
+        * *ipa* - epitran, output is International Phonetic Alphabet (IPA)
+        * *thaig2p* - (default) Thai Grapheme-to-Phoneme,
+          output is IPA (requires PyTorch)
 
     :Example:
     ::
 
         from pythainlp.transliterate import transliterate
 
-        transliterate("สามารถ", engine="thaig2p")
-        # output: 's aː ˩˩˦ . m aː t̚ ˥˩'
+        transliterate("สามารถ", engine="icu")
+        # output: 's̄āmārt̄h'
 
         transliterate("สามารถ", engine="ipa")
         # output: 'saːmaːrot'
 
-        transliterate("สามารถ", engine="icu")
-        # output: 's̄āmārt̄h'
+        transliterate("สามารถ", engine="thaig2p")
+        # output: 's aː ˩˩˦ . m aː t̚ ˥˩'
 
-        transliterate("ภาพยนตร์", engine="thaig2p")
-        # output:'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
+        transliterate("ภาพยนตร์", engine="icu")
+        # output: 'p̣hāphyntr̒'
 
         transliterate("ภาพยนตร์", engine="ipa")
         # output: 'pʰaːpjanot'
 
-        transliterate("ภาพยนตร์", engine="icu")
-        # output: 'p̣hāphyntr̒'
+        transliterate("ภาพยนตร์", engine="thaig2p")
+        # output: 'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
     """
 
     if not text or not isinstance(text, str):
         return ""
 
     if engine == "icu" or engine == "pyicu":
         from .pyicu import transliterate
-    elif engine == "thaig2p":
-        from .thaig2p import transliterate
-    else:
+    elif engine == "ipa":
         from .ipa import transliterate
+    else:  # use default engine: "thaig2p"
+        from .thaig2p import transliterate
 
     return transliterate(text)
+
+
+def pronunciate(word: str, engine: str = DEFAULT_PRONUNCIATE_ENGINE) -> str:
+    """
+    This function returns the pronunciation of a Thai word in Thai letters.
+
+    :param str word: Thai word to be pronounced
+    :param str engine: 'w2p' (default; currently the only engine)
+
+    :return: A string of Thai letters indicating
+             how the input word should be pronounced.
+    :rtype: str
+
+    :Options for engines:
+        * *w2p* - Thai Word-to-Phoneme
+
+    :Example:
+    ::
+
+        from pythainlp.transliterate import pronunciate
+
+        pronunciate("สามารถ", engine="w2p")
+        # output: 'สา-มาด'
+
+        pronunciate("ภาพยนตร์", engine="w2p")
+        # output: 'พาบ-พะ-ยน'
+    """
+    if not word or not isinstance(word, str):
+        return ""
+
+    # "w2p" is the only engine for now, so `engine` is not checked
+    from .w2p import pronunciate
+
+    return pronunciate(word)
diff --git a/pythainlp/transliterate/thaig2p.py b/pythainlp/transliterate/thaig2p.py
@@ -18,12 +18,11 @@
 
 
 class ThaiG2P:
-    def __init__(self):
-        """
-        Transliteration of Thai words.
+    """
+    Latin transliteration of Thai words, using International Phonetic Alphabet
+    """
 
-        Now supports Thai to Latin (romanization)
-        """
+    def __init__(self):
         # get the model, will download if it's not available locally
         self.__model_filename = get_corpus_path(_MODEL_NAME)
 

diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+"""
+Thai Word-to-Phoneme (Thai W2P)
+GitHub : https://github.com/wannaphong/Thai_W2P
+"""
+
+import codecs
+import os
+import re
+from typing import Union
+
+import numpy as np
+from pythainlp.corpus import download, get_corpus_path
+
+_GRAPHEMES = list(
+    "พจใงต้ืฮแาฐฒฤๅูศฅถฺฎหคสุขเึดฟำฝยลอ็ม"
+    + " ณิฑชฉซทรฏฬํัฃวก่ป์ผฆบี๊ธญฌษะไ๋นโภ?"
+)
+_PHONEMES = list(
+    "-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม"
+    + " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?"
+)
+
+_MODEL_NAME = "thai_w2p"
+
+
+class _Hparams:
+    batch_size = 256
+    enc_maxlen = 30 * 2
+    dec_maxlen = 40 * 2
+    num_epochs = 50 * 2
+    hidden_units = 64 * 8
+    emb_units = 64 * 4
+    graphemes = ["<pad>", "<unk>", "</s>"] + _GRAPHEMES
+    phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + _PHONEMES
+    lr = 0.001
+
+
+hp = _Hparams()
+
+
+def _load_vocab():
+    g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
+    idx2g = {idx: g for idx, g in enumerate(hp.graphemes)}
+
+    p2idx = {p: idx for idx, p in enumerate(hp.phonemes)}
+    idx2p = {idx: p for idx, p in enumerate(hp.phonemes)}
+    # note that g and p mean grapheme and phoneme, respectively.
+    return g2idx, idx2g, p2idx, idx2p
+
+
+class Thai_W2P(object):
+    def __init__(self):
+        super().__init__()
+        self.graphemes = hp.graphemes
+        self.phonemes = hp.phonemes
+        self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()
+        self.checkpoint = get_corpus_path(_MODEL_NAME)
+        if self.checkpoint is None:
+            download(_MODEL_NAME)
+            self.checkpoint = get_corpus_path(_MODEL_NAME)
+        self._load_variables()
+
+    def _load_variables(self):
+        self.variables = np.load(self.checkpoint, allow_pickle=True)
+        # (len(graphemes), emb)
+        self.enc_emb = self.variables.item().get("encoder.emb.weight")
+        # (3*hidden, emb)
+        self.enc_w_ih = self.variables.item().get("encoder.rnn.weight_ih_l0")
+        # (3*hidden, hidden)
+        self.enc_w_hh = self.variables.item().get("encoder.rnn.weight_hh_l0")
+        # (3*hidden,)
+        self.enc_b_ih = self.variables.item().get("encoder.rnn.bias_ih_l0")
+        # (3*hidden,)
+        self.enc_b_hh = self.variables.item().get("encoder.rnn.bias_hh_l0")
+
+        # (len(phonemes), emb)
+        self.dec_emb = self.variables.item().get("decoder.emb.weight")
+        # (3*hidden, emb)
+        self.dec_w_ih = self.variables.item().get("decoder.rnn.weight_ih_l0")
+        # (3*hidden, hidden)
+        self.dec_w_hh = self.variables.item().get("decoder.rnn.weight_hh_l0")
+        # (3*hidden,)
+        self.dec_b_ih = self.variables.item().get("decoder.rnn.bias_ih_l0")
+        # (3*hidden,)
+        self.dec_b_hh = self.variables.item().get("decoder.rnn.bias_hh_l0")
+        # (len(phonemes), hidden)
+        self.fc_w = self.variables.item().get("decoder.fc.weight")
+        # (len(phonemes),)
+        self.fc_b = self.variables.item().get("decoder.fc.bias")
+
+    def _sigmoid(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
+        rzn_ih = np.matmul(x, w_ih.T) + b_ih
+        rzn_hh = np.matmul(h, w_hh.T) + b_hh
+
+        rz_ih, n_ih = (
+            rzn_ih[:, : rzn_ih.shape[-1] * 2 // 3],
+            rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:],
+        )
+        rz_hh, n_hh = (
+            rzn_hh[:, : rzn_hh.shape[-1] * 2 // 3],
+            rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:],
+        )
+
+        rz = self._sigmoid(rz_ih + rz_hh)
+        r, z = np.split(rz, 2, -1)
+
+        n = np.tanh(n_ih + r * n_hh)
+        h = (1 - z) * n + z * h
+
+        return h
+
+    def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None) -> np.ndarray:
+        if h0 is None:
+            h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
+        h = h0  # initial hidden state
+
+        outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
+        for t in range(steps):
+            h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh)  # (b, h)
+            outputs[:, t, ::] = h
+
+        return outputs
+
+    def _encode(self, word: str) -> np.ndarray:
+        chars = list(word) + ["</s>"]
+        x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
+        x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
+
+        return x
+
+    def _short_word(self, word: str) -> Union[str, None]:
+        self.word = word
+        if self.word.endswith("."):
+            self.word = self.word.replace(".", "")
+            self.word = "-".join([i + "อ" for i in list(self.word)])
+            return self.word
+        return None
+
+    def _predict(self, word: str) -> str:
+        short_word = self._short_word(word)
+        if short_word is not None:
+            return short_word
+
+        # encoder
+        enc = self._encode(word)
+        enc = self._gru(
+            enc,
+            len(word) + 1,
+            self.enc_w_ih,
+            self.enc_w_hh,
+            self.enc_b_ih,
+            self.enc_b_hh,
+            h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32),
+        )
+        last_hidden = enc[:, -1, :]
+
+        # decoder
+        dec = np.take(self.dec_emb, [2], axis=0)  # 2: <s>
+        h = last_hidden
+
+        preds = []
+        for _ in range(20):
+            h = self._grucell(
+                dec,
+                h,
+                self.dec_w_ih,
+                self.dec_w_hh,
+                self.dec_b_ih,
+                self.dec_b_hh,
+            )  # (b, h)
+            logits = np.matmul(h, self.fc_w.T) + self.fc_b
+            pred = logits.argmax()
+            if pred == 3:
+                break
+            preds.append(pred)
+            dec = np.take(self.dec_emb, [pred], axis=0)
+
+        preds = [self.idx2p.get(idx, "<unk>") for idx in preds]
+
+        return preds
+
+    def __call__(self, word: str) -> str:
+        if not any(letter in word for letter in self.graphemes):
+            pron = [word]
+        else:  # predict for oov
+            pron = self._predict(word)
+
+        return "".join(pron)
+
+
+_THAI_W2P = Thai_W2P()
+
+
+def pronunciate(text: str) -> str:
+    """
+    Convert a Thai word to its pronunciation in Thai letters.
+
+    Input should be one single word.
+    """
+    global _THAI_W2P
+    return _THAI_W2P(text)
diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py
@@ -3,7 +3,7 @@
 import unittest
 
 import torch
-from pythainlp.transliterate import romanize, transliterate
+from pythainlp.transliterate import romanize, transliterate, pronunciate
 from pythainlp.transliterate.ipa import trans_list, xsampa_list
 from pythainlp.transliterate.thai2rom import ThaiTransliterator
 
@@ -134,3 +134,11 @@ def test_transliterate(self):
         self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
         self.assertIsNotNone(trans_list("คน"))
         self.assertIsNotNone(xsampa_list("คน"))
+
+    def test_pronunciate(self):
+        self.assertEqual(pronunciate(""), "")
+        self.assertIsNotNone(pronunciate("คน", engine="w2p"))
+        self.assertIsNotNone(pronunciate("แมว", engine="w2p"))
+        self.assertIsNotNone(pronunciate("มข.", engine="w2p"))
+        self.assertIsNotNone(pronunciate("มช.", engine="w2p"))
+        self.assertIsNotNone(pronunciate("jks", engine="w2p"))