Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix warnings and types #978

Merged
merged 5 commits into from
Nov 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: ["macos-latest", "ubuntu-latest", "windows-latest"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
os: ["ubuntu-latest", "windows-latest", "macos-latest"]
python-version: ["3.13", "3.12", "3.11", "3.10", "3.9"]

runs-on: ${{ matrix.os }}
env:
Expand Down
90 changes: 52 additions & 38 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
Common lists of words.
"""

import ast

__all__ = [
"countries",
"find_synonyms",
Expand Down Expand Up @@ -56,9 +58,9 @@

_THAI_ORST_WORDS: FrozenSet[str] = frozenset()

_THAI_DICT = {}
_THAI_WSD_DICT = {}
_THAI_SYNONYMS = {}
_THAI_DICT: dict[str, list] = {}
_THAI_WSD_DICT: dict[str, list] = {}
_THAI_SYNONYMS: dict[str, list] = {}


def countries() -> FrozenSet[str]:
Expand Down Expand Up @@ -268,17 +270,22 @@ def thai_dict() -> dict:
:rtype: dict
"""
global _THAI_DICT
if not _THAI_DICT:
import csv

_THAI_DICT = {"word": [], "meaning": []}
with open(
get_corpus_path("thai_dict"), newline="\n", encoding="utf-8"
) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_DICT["word"].append(row["word"])
_THAI_DICT["meaning"].append(row["meaning"])
if _THAI_DICT:
return _THAI_DICT

import csv

path = get_corpus_path("thai_dict")
if not path:
return _THAI_DICT
path = str(path)

_THAI_DICT = {"word": [], "meaning": []}
with open(path, newline="\n", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_DICT["word"].append(row["word"])
_THAI_DICT["meaning"].append(row["meaning"])

return _THAI_DICT

Expand All @@ -293,18 +300,20 @@ def thai_wsd_dict() -> dict:
:rtype: dict
"""
global _THAI_WSD_DICT
if not _THAI_WSD_DICT:
_thai_wsd = thai_dict()
_THAI_WSD_DICT = {"word": [], "meaning": []}
for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]):
_all_value = list(eval(j).values())
_use = []
for k in _all_value:
_use.extend(k)
_use = list(set(_use))
if len(_use) > 1:
_THAI_WSD_DICT["word"].append(i)
_THAI_WSD_DICT["meaning"].append(_use)
if _THAI_WSD_DICT:
return _THAI_WSD_DICT

thai_wsd = thai_dict()
_THAI_WSD_DICT = {"word": [], "meaning": []}
for i, j in zip(thai_wsd["word"], thai_wsd["meaning"]):
all_value = list(ast.literal_eval(j).values())
use = []
for k in all_value:
use.extend(k)
use = list(set(use))
if len(use) > 1:
_THAI_WSD_DICT["word"].append(i)
_THAI_WSD_DICT["meaning"].append(use)

return _THAI_WSD_DICT

Expand All @@ -319,18 +328,23 @@ def thai_synonyms() -> dict:
:rtype: dict
"""
global _THAI_SYNONYMS
if not _THAI_SYNONYMS:
import csv

_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
with open(
get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8"
) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_SYNONYMS["word"].append(row["word"])
_THAI_SYNONYMS["pos"].append(row["pos"])
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))
if _THAI_SYNONYMS:
return _THAI_SYNONYMS

import csv

path = get_corpus_path("thai_synonym")
if not path:
return _THAI_SYNONYMS
path = str(path)

_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
with open(path, newline="\n", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_SYNONYMS["word"].append(row["word"])
_THAI_SYNONYMS["pos"].append(row["pos"])
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))

return _THAI_SYNONYMS

Expand Down
70 changes: 39 additions & 31 deletions pythainlp/corpus/oscar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,43 +15,51 @@

from pythainlp.corpus import get_corpus_path

_FILENAME = "oscar_icu"
_OSCAR_FILENAME = "oscar_icu"


def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (words tokenized using ICU).

    :return: list of (word, frequency) pairs; empty if the corpus
        file is not available locally
    :rtype: List[Tuple[str, int]]
    """
    freqs: list[tuple[str, int]] = []
    path = get_corpus_path(_OSCAR_FILENAME)
    if not path:
        # Corpus not downloaded/installed; degrade gracefully
        # instead of crashing on open(None).
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
        del lines[0]  # drop the CSV header row
        for line in lines:
            temp = line.strip().split(",")
            if len(temp) >= 2:
                if temp[0] != " " and '"' not in temp[0]:
                    freqs.append((temp[0], int(temp[1])))
                elif temp[0] == " ":
                    # a bare-space token is recorded under the
                    # sentinel "<s/>" instead of an empty word
                    freqs.append(("<s/>", int(temp[1])))

    return freqs


def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from OSCAR Corpus (words tokenized using ICU).

    :return: mapping of word -> frequency (a defaultdict(int));
        empty if the corpus file is not available locally
    :rtype: dict[str, int]
    """
    freqs: dict[str, int] = defaultdict(int)
    path = get_corpus_path(_OSCAR_FILENAME)
    if not path:
        # Corpus not downloaded/installed; return the empty mapping.
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as fh:
        lines = fh.readlines()
        del lines[0]  # drop the CSV header row
        for i in lines:
            temp = i.strip().split(",")
            if temp[0] != " " and '"' not in temp[0]:
                # last field holds the count
                freqs[temp[0]] = int(temp[-1])
            elif temp[0] == " ":
                # bare-space token recorded under the sentinel "<s/>"
                freqs["<s/>"] = int(temp[-1])

    return freqs
10 changes: 5 additions & 5 deletions pythainlp/corpus/th_en_translit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def get_transliteration_dict() -> defaultdict:
"""
Get Thai to English transliteration dictionary.

The returned dict is in defaultdict[str, defaultdict[List[str], List[Optional[bool]]]] format.
The returned dict is in dict[str, dict[List[str], List[Optional[bool]]]] format.
"""
path = path_pythainlp_corpus(_FILE_NAME)
if not path:
Expand All @@ -38,7 +38,7 @@ def get_transliteration_dict() -> defaultdict:
)

# use list, as one word can have multiple transliterations.
trans_dict = defaultdict(
trans_dict: defaultdict[str, dict[str, list]] = defaultdict(
lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []}
)
try:
Expand All @@ -61,11 +61,11 @@ def get_transliteration_dict() -> defaultdict:
en_follow_rtgs
)

except ValueError:
except ValueError as exc:
raise ValueError(
f"Unable to parse {_FILE_NAME}."
f"Unable to parse {_FILE_NAME}. "
f"Make sure it is a 3-column tab-separated file with header."
)
) from exc
else:
return trans_dict

Expand Down
64 changes: 36 additions & 28 deletions pythainlp/corpus/tnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
"""

__all__ = [
"word_freqs",
"unigram_word_freqs",
"bigram_word_freqs",
"trigram_word_freqs",
"unigram_word_freqs",
"word_freqs",
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus, get_corpus_path

_FILENAME = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"
_TRIGRAM = "tnc_trigram_word_freqs"
_UNIGRAM_FILENAME = "tnc_freq.txt"
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"


def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC).

    Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445

    :return: list of (word, frequency) pairs
    :rtype: List[Tuple[str, int]]
    """
    freqs: list[tuple[str, int]] = []
    lines = list(get_corpus(_UNIGRAM_FILENAME))
    for line in lines:
        # tnc_freq.txt rows are "<word>\t<frequency>"
        word_freq = line.split("\t")
        if len(word_freq) >= 2:
            freqs.append((word_freq[0], int(word_freq[1])))

    return freqs


def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from Thai National Corpus (TNC).

    :return: mapping of word -> frequency (a defaultdict(int))
    :rtype: dict[str, int]
    """
    freqs: dict[str, int] = defaultdict(int)
    lines = list(get_corpus(_UNIGRAM_FILENAME))
    for i in lines:
        # tnc_freq.txt rows are tab-separated "<word>\t<frequency>",
        # same file/format read by word_freqs() above
        _temp = i.strip().split("\t")
        if len(_temp) >= 2:
            freqs[_temp[0]] = int(_temp[-1])

    return freqs


def bigram_word_freqs() -> dict[Tuple[str, str], int]:
    """
    Get bigram word frequency from Thai National Corpus (TNC).

    :return: mapping of (word1, word2) -> frequency (a defaultdict(int));
        empty if the corpus file is not available locally
    :rtype: dict[Tuple[str, str], int]
    """
    freqs: dict[tuple[str, str], int] = defaultdict(int)
    path = get_corpus_path(_BIGRAM_CORPUS_NAME)
    if not path:
        # Corpus not downloaded/installed; return the empty mapping.
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as fh:
        for i in fh.readlines():
            # NOTE(review): columns assumed tab-separated, consistent
            # with the other TNC frequency files — confirm against data
            temp = i.strip().split("\t")
            freqs[(temp[0], temp[1])] = int(temp[-1])

    return freqs


def trigram_word_freqs() -> dict[Tuple[str, str, str], int]:
    """
    Get trigram word frequency from Thai National Corpus (TNC).

    :return: mapping of (word1, word2, word3) -> frequency
        (a defaultdict(int)); empty if the corpus file is not
        available locally
    :rtype: dict[Tuple[str, str, str], int]
    """
    freqs: dict[tuple[str, str, str], int] = defaultdict(int)
    path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
    if not path:
        # Corpus not downloaded/installed; return the empty mapping.
        return freqs
    path = str(path)

    with open(path, "r", encoding="utf-8-sig") as fh:
        for i in fh.readlines():
            # NOTE(review): columns assumed tab-separated, consistent
            # with the other TNC frequency files — confirm against data
            temp = i.strip().split("\t")
            freqs[(temp[0], temp[1], temp[2])] = int(temp[-1])

    return freqs
Loading