Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix warnings and types #978

Merged
merged 5 commits into from
Nov 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: ["macos-latest", "ubuntu-latest", "windows-latest"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
os: ["ubuntu-latest", "windows-latest", "macos-latest"]
python-version: ["3.13", "3.12", "3.11", "3.10", "3.9"]

runs-on: ${{ matrix.os }}
env:
Expand Down
90 changes: 52 additions & 38 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
Common lists of words.
"""

import ast

__all__ = [
"countries",
"find_synonyms",
Expand Down Expand Up @@ -56,9 +58,9 @@

_THAI_ORST_WORDS: FrozenSet[str] = frozenset()

_THAI_DICT = {}
_THAI_WSD_DICT = {}
_THAI_SYNONYMS = {}
_THAI_DICT: dict[str, list] = {}
_THAI_WSD_DICT: dict[str, list] = {}
_THAI_SYNONYMS: dict[str, list] = {}


def countries() -> FrozenSet[str]:
Expand Down Expand Up @@ -268,17 +270,22 @@ def thai_dict() -> dict:
:rtype: dict
"""
global _THAI_DICT
if not _THAI_DICT:
import csv

_THAI_DICT = {"word": [], "meaning": []}
with open(
get_corpus_path("thai_dict"), newline="\n", encoding="utf-8"
) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_DICT["word"].append(row["word"])
_THAI_DICT["meaning"].append(row["meaning"])
if _THAI_DICT:
return _THAI_DICT

import csv

path = get_corpus_path("thai_dict")
if not path:
return _THAI_DICT
path = str(path)

_THAI_DICT = {"word": [], "meaning": []}
with open(path, newline="\n", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_DICT["word"].append(row["word"])
_THAI_DICT["meaning"].append(row["meaning"])

return _THAI_DICT

Expand All @@ -293,18 +300,20 @@ def thai_wsd_dict() -> dict:
:rtype: dict
"""
global _THAI_WSD_DICT
if not _THAI_WSD_DICT:
_thai_wsd = thai_dict()
_THAI_WSD_DICT = {"word": [], "meaning": []}
for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]):
_all_value = list(eval(j).values())
_use = []
for k in _all_value:
_use.extend(k)
_use = list(set(_use))
if len(_use) > 1:
_THAI_WSD_DICT["word"].append(i)
_THAI_WSD_DICT["meaning"].append(_use)
if _THAI_WSD_DICT:
return _THAI_WSD_DICT

thai_wsd = thai_dict()
_THAI_WSD_DICT = {"word": [], "meaning": []}
for i, j in zip(thai_wsd["word"], thai_wsd["meaning"]):
all_value = list(ast.literal_eval(j).values())
use = []
for k in all_value:
use.extend(k)
use = list(set(use))
if len(use) > 1:
_THAI_WSD_DICT["word"].append(i)
_THAI_WSD_DICT["meaning"].append(use)

return _THAI_WSD_DICT

Expand All @@ -319,18 +328,23 @@ def thai_synonyms() -> dict:
:rtype: dict
"""
global _THAI_SYNONYMS
if not _THAI_SYNONYMS:
import csv

_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
with open(
get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8"
) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_SYNONYMS["word"].append(row["word"])
_THAI_SYNONYMS["pos"].append(row["pos"])
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))
if _THAI_SYNONYMS:
return _THAI_SYNONYMS

import csv

path = get_corpus_path("thai_synonym")
if not path:
return _THAI_SYNONYMS
path = str(path)

_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
with open(path, newline="\n", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_SYNONYMS["word"].append(row["word"])
_THAI_SYNONYMS["pos"].append(row["pos"])
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))

return _THAI_SYNONYMS

Expand Down
70 changes: 39 additions & 31 deletions pythainlp/corpus/oscar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,43 +15,51 @@

from pythainlp.corpus import get_corpus_path

_FILENAME = "oscar_icu"
_OSCAR_FILENAME = "oscar_icu"


def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (words tokenized using ICU).

    :return: list of (word, frequency) pairs; empty if the corpus
        file is not available locally
    :rtype: List[Tuple[str, int]]
    """
    freqs: list[tuple[str, int]] = []
    path = get_corpus_path(_OSCAR_FILENAME)
    if not path:
        # Corpus not downloaded/installed; degrade gracefully
        # instead of crashing on open(None).
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
        del lines[0]  # drop the CSV header row
        for line in lines:
            temp = line.strip().split(",")
            if len(temp) >= 2:
                if temp[0] != " " and '"' not in temp[0]:
                    freqs.append((temp[0], int(temp[1])))
                elif temp[0] == " ":
                    # a bare-space token is recorded under the
                    # sentinel "<s/>" instead of an empty word
                    freqs.append(("<s/>", int(temp[1])))

    return freqs


def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from OSCAR Corpus (words tokenized using ICU).

    :return: mapping of word -> frequency (a defaultdict(int));
        empty if the corpus file is not available locally
    :rtype: dict[str, int]
    """
    freqs: dict[str, int] = defaultdict(int)
    path = get_corpus_path(_OSCAR_FILENAME)
    if not path:
        # Corpus not downloaded/installed; return the empty mapping.
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as fh:
        lines = fh.readlines()
        del lines[0]  # drop the CSV header row
        for i in lines:
            temp = i.strip().split(",")
            if temp[0] != " " and '"' not in temp[0]:
                # last field holds the count
                freqs[temp[0]] = int(temp[-1])
            elif temp[0] == " ":
                # bare-space token recorded under the sentinel "<s/>"
                freqs["<s/>"] = int(temp[-1])

    return freqs
10 changes: 5 additions & 5 deletions pythainlp/corpus/th_en_translit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def get_transliteration_dict() -> defaultdict:
"""
Get Thai to English transliteration dictionary.

The returned dict is in defaultdict[str, defaultdict[List[str], List[Optional[bool]]]] format.
The returned dict is in dict[str, dict[List[str], List[Optional[bool]]]] format.
"""
path = path_pythainlp_corpus(_FILE_NAME)
if not path:
Expand All @@ -38,7 +38,7 @@ def get_transliteration_dict() -> defaultdict:
)

# use list, as one word can have multiple transliterations.
trans_dict = defaultdict(
trans_dict: defaultdict[str, dict[str, list]] = defaultdict(
lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []}
)
try:
Expand All @@ -61,11 +61,11 @@ def get_transliteration_dict() -> defaultdict:
en_follow_rtgs
)

except ValueError:
except ValueError as exc:
raise ValueError(
f"Unable to parse {_FILE_NAME}."
f"Unable to parse {_FILE_NAME}. "
f"Make sure it is a 3-column tab-separated file with header."
)
) from exc
else:
return trans_dict

Expand Down
64 changes: 36 additions & 28 deletions pythainlp/corpus/tnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
"""

__all__ = [
"word_freqs",
"unigram_word_freqs",
"bigram_word_freqs",
"trigram_word_freqs",
"unigram_word_freqs",
"word_freqs",
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus, get_corpus_path

_FILENAME = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"
_TRIGRAM = "tnc_trigram_word_freqs"
_UNIGRAM_FILENAME = "tnc_freq.txt"
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"


def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC).

    Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445

    :return: list of (word, frequency) pairs
    :rtype: List[Tuple[str, int]]
    """
    freqs: list[tuple[str, int]] = []
    lines = list(get_corpus(_UNIGRAM_FILENAME))
    for line in lines:
        # tnc_freq.txt rows are "<word>\t<frequency>"
        word_freq = line.split("\t")
        if len(word_freq) >= 2:
            freqs.append((word_freq[0], int(word_freq[1])))

    return freqs


def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from Thai National Corpus (TNC).

    :return: mapping of word -> frequency (a defaultdict(int))
    :rtype: dict[str, int]
    """
    freqs: dict[str, int] = defaultdict(int)
    lines = list(get_corpus(_UNIGRAM_FILENAME))
    for i in lines:
        # tnc_freq.txt rows are tab-separated "<word>\t<frequency>",
        # same file/format read by word_freqs() above
        _temp = i.strip().split("\t")
        if len(_temp) >= 2:
            freqs[_temp[0]] = int(_temp[-1])

    return freqs


def bigram_word_freqs() -> dict[Tuple[str, str], int]:
    """
    Get bigram word frequency from Thai National Corpus (TNC).

    :return: mapping of (word1, word2) -> frequency (a defaultdict(int));
        empty if the corpus file is not available locally
    :rtype: dict[Tuple[str, str], int]
    """
    freqs: dict[tuple[str, str], int] = defaultdict(int)
    path = get_corpus_path(_BIGRAM_CORPUS_NAME)
    if not path:
        # Corpus not downloaded/installed; return the empty mapping.
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as fh:
        for i in fh.readlines():
            # NOTE(review): columns assumed tab-separated, consistent
            # with the other TNC frequency files — confirm against data
            temp = i.strip().split("\t")
            freqs[(temp[0], temp[1])] = int(temp[-1])

    return freqs


def trigram_word_freqs() -> dict[Tuple[str, str, str], int]:
    """
    Get trigram word frequency from Thai National Corpus (TNC).

    :return: mapping of (word1, word2, word3) -> frequency
        (a defaultdict(int)); empty if the corpus file is not
        available locally
    :rtype: dict[Tuple[str, str, str], int]
    """
    freqs: dict[tuple[str, str, str], int] = defaultdict(int)
    path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
    if not path:
        # Corpus not downloaded/installed; return the empty mapping.
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as fh:
        for i in fh.readlines():
            # NOTE(review): columns assumed tab-separated, consistent
            # with the other TNC frequency files — confirm against data
            temp = i.strip().split("\t")
            freqs[(temp[0], temp[1], temp[2])] = int(temp[-1])

    return freqs
Loading