PyThaiNLP · wannaphong · Jan 14, 2024 · Jan 1, 2024 · Jan 1, 2024 · Jan 5, 2024
diff --git a/docs/api/morpheme.rst b/docs/api/morpheme.rst
@@ -0,0 +1,13 @@
+.. currentmodule:: pythainlp.morpheme
+
+pythainlp.morpheme
+==================
+
+The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language.
+
+.. autofunction:: nighit
+
+.. autofunction:: is_native_thai
+    :noindex:
+
+    The `is_native_thai` function checks whether a word is a "native Thai word" (คำไทยแท้). It is based on a simple heuristic, so the result cannot be entirely reliable.
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -77,11 +77,6 @@ Modules
 
     The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides.
 
-.. autofunction:: is_native_thai
-    :noindex:
-
-    The `is_native_thai` function is a language detection tool that identifies whether text is predominantly in the Thai language or not. It aids in language identification and text categorization tasks.
-
 .. autofunction:: isthai
     :noindex:
 

diff --git a/notebooks/create_words.ipynb b/notebooks/create_words.ipynb
@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pythainlp.transliterate import pronunciate\n",
+    "from pythainlp import thai_consonants"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'พุด-ทะ'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pronunciate(\"พุทธ\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'บู-ชา'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pronunciate(\"บูชา\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'อะ-นุก'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pronunciate(\"อนุค\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def nighit(w1,w2): # read: https://www.trueplookpanya.com/learning/detail/1180\n",
+    "    if not str(w1).endswith('ํ') and len(w1)!=2:\n",
+    "        raise NotImplementedError(f\"The function doesn't support {w1}.\")\n",
+    "    list_w1 = list(w1)\n",
+    "    list_w2 = list(w2)\n",
+    "    newword = list()\n",
+    "    newword.append(list_w1[0])\n",
+    "    newword.append(\"ั\")\n",
+    "    consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]\n",
+    "    if consonant_start in [\"ก\",\"ช\",\"ค\",\"ข\",\"ง\"]:\n",
+    "        newword.append(\"ง\")\n",
+    "    elif consonant_start in [\"จ\",\"ฉ\",\"ช\",\"ฌ\"]:\n",
+    "        newword.append(\"ญ\")\n",
+    "    elif consonant_start in [\"ฎ\",\"ฐ\",\"ฑ\",\"ณ\"]:\n",
+    "        newword.append(\"ณ\")\n",
+    "    elif consonant_start in [\"ด\",\"ถ\",\"ท\",\"ธ\",\"น\"]:\n",
+    "        newword.append(\"น\")\n",
+    "    elif consonant_start in [\"ป\",\"ผ\",\"พ\",\"ภ\"]:\n",
+    "        newword.append(\"ม\")\n",
+    "    elif consonant_start in [\"ย\",\"ร\",\"ล\",\"ฬ\",\"ว\",\"ศ\",\"ษ\",\"ส\",\"ห\"]:\n",
+    "        newword.append(\"ง\")\n",
+    "    else:\n",
+    "        raise NotImplementedError(f\"The function doesn't support {w1} and {w2}.\")\n",
+    "    newword.extend(list_w2)\n",
+    "    return ''.join(newword)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert nighit(\"สํ\",\"คีต\")==\"สังคีต\"\n",
+    "assert nighit(\"สํ\",\"จร\")==\"สัญจร\"\n",
+    "assert nighit(\"สํ\",\"ฐาน\")==\"สัณฐาน\"\n",
+    "assert nighit(\"สํ\",\"นิษฐาน\")==\"สันนิษฐาน\"\n",
+    "assert nighit(\"สํ\",\"ปทา\")==\"สัมปทา\"\n",
+    "assert nighit(\"สํ\",\"โยค\")==\"สังโยค\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.13 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pythainlp/morpheme/__init__.py b/pythainlp/morpheme/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+PyThaiNLP morpheme
+"""
+__all__ = [
+    "nighit",
+    "is_native_thai"
+]
+from pythainlp.morpheme.word_formation import nighit
+from pythainlp.morpheme.thaiwordcheck import is_native_thai
diff --git a/pythainlp/morpheme/thaiwordcheck.py b/pythainlp/morpheme/thaiwordcheck.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+"""
+Check if a word is a "native Thai word"
+
+Adapted from
+https://github.com/wannaphong/open-thai-nlp-document/blob/master/check_thai_word.md
+
+References
+- ทีมงานทรูปลูกปัญญา 2015. ลักษณะของคำไทยแท้ \
+    http://www.trueplookpanya.com/learning/detail/30589-043067
+- วารุณี บำรุงรส 2010. คำไทยแท้ https://www.gotoknow.org/posts/377619
+"""
+import re
+
+_THANTHAKHAT_CHAR = "\u0e4c"  # Thanthakhat (cancellation of sound)
+
+# Characters whose presence makes is_native_thai() reject a word
+_TH_NON_NATIVE_CHARS = {
+    "ฆ",
+    "ณ",
+    "ฌ",
+    "ฎ",
+    "ฏ",
+    "ฐ",
+    "ฑ",
+    "ฒ",
+    "ธ",
+    "ศ",
+    "ษ",
+    "ฬ",
+    _THANTHAKHAT_CHAR,
+}
+
+# Last characters accepted as native Thai final consonants
+_TH_NATIVE_FINALS = {"ก", "ด", "บ", "น", "ง", "ม", "ย", "ว"}
+
+# Known native Thai words (exceptions): accepted before any other rule
+_TH_NATIVE_WORDS = {
+    "ฆ่า",
+    "เฆี่ยน",
+    "ศึก",
+    "ศอก",
+    "เศิก",
+    "เศร้า",
+    "ธ",
+    "ณ",
+    "ฯพณฯ",
+    "ใหญ่",
+    "หญ้า",
+    "ควาย",
+    "ความ",
+    "กริ่งเกรง",
+    "ผลิ",
+}
+
+# Prefixes that can start a native Thai word (compared to the whole word)
+_TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"}
+
+# Thai consonant filter: matches U+0E01-U+0E2C (ก-ฬ) and U+0E2E (ฮ)
+# O ANG (U+0E2D) is omitted, as it can be considered as vowel
+_TH_CONSONANTS_PATTERN = re.compile(r"[ก-ฬฮ]", re.U)
+
+
+def is_native_thai(word: str) -> bool:
+    """
+    Check if a word is an "native Thai word" (Thai: "คำไทยแท้")
+    This function is based on a simple heuristic algorithm
+    and cannot be entirely reliable.
+
+    :param str word: word
+    :return: True or False
+    :rtype: bool
+
+    :Example:
+
+    English word::
+
+        from pythainlp.util import is_native_thai
+
+        is_native_thai("Avocado")
+        # output: False
+
+    Native Thai word::
+
+        is_native_thai("มะม่วง")
+        # output: True
+        is_native_thai("ตะวัน")
+        # output: True
+
+    Non-native Thai word::
+
+        is_native_thai("สามารถ")
+        # output: False
+        is_native_thai("อิสริยาภรณ์")
+        # output: False
+    """
+    if not isinstance(word, str) or not word.strip():
+        return False
+
+    word = word.strip()
+
+    # Known native Thai words (exceptions)
+    if word in _TH_NATIVE_WORDS:
+        return True
+
+    # If a word contains non-Thai chars, it is not a native Thai
+    if any(ch in word for ch in _TH_NON_NATIVE_CHARS):
+        return False
+
+    # If it does not contain any Thai consonants -> it cannot be Thai
+    chs = re.findall(_TH_CONSONANTS_PATTERN, word)
+    if not chs:
+        return False
+
+    # If there's only one Thai consonant -> it can be a native Thai
+    if len(chs) == 1:
+        return True
+
+    # If a word ends with native final, it can be a native Thai
+    if word[-1] in _TH_NATIVE_FINALS:
+        return True
+
+    # Note: This will not work, as it check the whole word, not the prefix.
+    # Prefix-sensitive tokenization is required in order to be able to check this.
+    if word in _TH_PREFIX_DIPHTHONG:
+        return True
+
+    return False
diff --git a/pythainlp/morpheme/word_formation.py b/pythainlp/morpheme/word_formation.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+from pythainlp import thai_consonants
+
+
+def nighit(w1: str, w2: str) -> str:
+    """
+    Nighit (นิคหิต or  ํ ) is the niggahita in Thai language for create new \
+    words from Pali language in Thai.
+    The function use simple method to create new Thai word from two words \
+    that the root is from Pali language.
+
+    Read more: https://www.trueplookpanya.com/learning/detail/1180
+
+    :param str w1: A Thai word that has a nighit.
+    :param str w2: A Thai word.
+    :return: Thai word.
+    :rtype: str
+    :Example:
+    ::
+        from pythainlp.morpheme import nighit
+
+        assert nighit("สํ","คีต")=="สังคีต"
+        assert nighit("สํ","จร")=="สัญจร"
+        assert nighit("สํ","ฐาน")=="สัณฐาน"
+        assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
+        assert nighit("สํ","ปทา")=="สัมปทา"
+        assert nighit("สํ","โยค")=="สังโยค"
+    """
+    if not str(w1).endswith('ํ') and len(w1) != 2:
+        raise NotImplementedError(f"The function doesn't support {w1}.")
+    list_w1 = list(w1)
+    list_w2 = list(w2)
+    newword = list()
+    newword.append(list_w1[0])
+    newword.append("ั")
+    consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]
+    if consonant_start in ["ก", "ช", "ค", "ข", "ง"]:
+        newword.append("ง")
+    elif consonant_start in ["จ", "ฉ", "ช", "ฌ"]:
+        newword.append("ญ")
+    elif consonant_start in ["ฎ", "ฐ", "ฑ", "ณ"]:
+        newword.append("ณ")
+    elif consonant_start in ["ด", "ถ", "ท", "ธ", "น"]:
+        newword.append("น")
+    elif consonant_start in ["ป", "ผ", "พ", "ภ"]:
+        newword.append("ม")
+    elif consonant_start in ["ย", "ร", "ล", "ฬ", "ว", "ศ", "ษ", "ส", "ห"]:
+        newword.append("ง")
+    else:
+        raise NotImplementedError(f"""
+        The function doesn't support {w1} and {w2}.
+        """)
+    newword.extend(list_w2)
+    return ''.join(newword)