Skip to content

Commit

Permalink
Merge pull request #882 from bact/rename-corpus-volubilis-wikipedia
Browse files Browse the repository at this point in the history
Rename corpus function names for consistency / Fix types
  • Loading branch information
bact authored Dec 6, 2023
2 parents 297aadc + 94ec1fc commit 9bd951b
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 106 deletions.
9 changes: 7 additions & 2 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ get_corpus
.. autofunction:: get_corpus
:noindex:

get_corpus_as_is
~~~~~~~~~~~~~~~~
.. autofunction:: get_corpus_as_is
:noindex:

get_corpus_db
~~~~~~~~~~~~~~
.. autofunction:: get_corpus_db
Expand Down Expand Up @@ -77,9 +82,9 @@ thai_orst_words
.. autofunction:: thai_orst_words
:noindex:

thai_synonym
thai_synonyms
~~~~~~~~~~~~~~
.. autofunction:: thai_synonym
.. autofunction:: thai_synonyms
:noindex:

thai_syllables
Expand Down
12 changes: 8 additions & 4 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"countries",
"download",
"get_corpus",
"get_corpus_as_is",
"get_corpus_db",
"get_corpus_db_detail",
"get_corpus_default_db",
Expand All @@ -33,10 +34,11 @@
"thai_stopwords",
"thai_syllables",
"thai_synonym",
"thai_synonyms",
"thai_volubilis_words",
"thai_wikipedia_titles",
"thai_words",
"thai_wsd_dict",
"volubilis",
"wikipedia_titles",
]

import os
Expand Down Expand Up @@ -88,6 +90,7 @@ def corpus_db_path() -> str:
from pythainlp.corpus.core import (
download,
get_corpus,
get_corpus_as_is,
get_corpus_db,
get_corpus_db_detail,
get_corpus_default_db,
Expand All @@ -108,9 +111,10 @@ def corpus_db_path() -> str:
thai_stopwords,
thai_syllables,
thai_synonym,
thai_synonyms,
thai_words,
thai_wsd_dict,
)
from pythainlp.corpus.icu import thai_icu_words
from pythainlp.corpus.volubilis import volubilis
from pythainlp.corpus.wikipedia_titles import wikipedia_titles
from pythainlp.corpus.volubilis import thai_volubilis_words
from pythainlp.corpus.wikipedia import thai_wikipedia_titles
83 changes: 48 additions & 35 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,49 +12,51 @@
"thai_female_names",
"thai_male_names",
"thai_negations",
"thai_dict",
"thai_stopwords",
"thai_syllables",
"thai_synonym",
"thai_synonyms",
"thai_words",
"thai_dict",
"thai_wsd_dict",
"thai_synonym",
]

from typing import FrozenSet, List, Union
import warnings

from pythainlp.corpus import get_corpus, get_corpus_path
from pythainlp.corpus import get_corpus, get_corpus_as_is, get_corpus_path

_THAI_COUNTRIES = set()
_THAI_COUNTRIES: FrozenSet[str] = frozenset()
_THAI_COUNTRIES_FILENAME = "countries_th.txt"

_THAI_THAILAND_PROVINCES = set()
_THAI_THAILAND_PROVINCES_DETAILS = []
_THAI_THAILAND_PROVINCES: FrozenSet[str] = frozenset()
_THAI_THAILAND_PROVINCES_DETAILS: List[dict] = []
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv"

_THAI_SYLLABLES = set()
_THAI_SYLLABLES: FrozenSet[str] = frozenset()
_THAI_SYLLABLES_FILENAME = "syllables_th.txt"

_THAI_WORDS = set()
_THAI_WORDS: FrozenSet[str] = frozenset()
_THAI_WORDS_FILENAME = "words_th.txt"

_THAI_STOPWORDS = set()
_THAI_STOPWORDS: FrozenSet[str] = frozenset()
_THAI_STOPWORDS_FILENAME = "stopwords_th.txt"

_THAI_NEGATIONS = set()
_THAI_NEGATIONS: FrozenSet[str] = frozenset()
_THAI_NEGATIONS_FILENAME = "negations_th.txt"

_THAI_FAMLIY_NAMES = set()
_THAI_FAMLIY_NAMES: FrozenSet[str] = frozenset()
_THAI_FAMLIY_NAMES_FILENAME = "family_names_th.txt"
_THAI_FEMALE_NAMES = set()
_THAI_FEMALE_NAMES: FrozenSet[str] = frozenset()
_THAI_FEMALE_NAMES_FILENAME = "person_names_female_th.txt"
_THAI_MALE_NAMES = set()
_THAI_MALE_NAMES: FrozenSet[str] = frozenset()
_THAI_MALE_NAMES_FILENAME = "person_names_male_th.txt"

_THAI_ORST_WORDS = set()
_THAI_ORST_WORDS: FrozenSet[str] = frozenset()

_THAI_DICT = {}
_THAI_WSD_DICT = {}
_THAI_SYNONYM = None
_THAI_SYNONYMS = {}


def countries() -> FrozenSet[str]:
Expand All @@ -74,7 +76,7 @@ def countries() -> FrozenSet[str]:
return _THAI_COUNTRIES


def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
def provinces(details: bool = False) -> Union[FrozenSet[str], List[dict]]:
"""
Return a frozenset of Thailand province names in Thai such as "กระบี่",
"กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
Expand All @@ -96,7 +98,7 @@ def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]:
provs = set()
prov_details = []

for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True):
for line in get_corpus_as_is(_THAI_THAILAND_PROVINCES_FILENAME):
p = line.split(",")

prov = {}
Expand Down Expand Up @@ -155,14 +157,14 @@ def thai_orst_words() -> FrozenSet[str]:
"""
Return a frozenset of Thai words from Royal Society of Thailand
\n(See: `dev/pythainlp/corpus/orst_words_th.txt\
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thai_orst_words>`_)
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/orst_words_th.txt>`_)
:return: :class:`frozenset` containing words in the Thai language.
:rtype: :class:`frozenset`
"""
global _THAI_ORST_WORDS
if not _THAI_ORST_WORDS:
_THAI_ORST_WORDS = get_corpus("thai_orst_words.txt")
_THAI_ORST_WORDS = get_corpus("orst_words_th.txt")

return _THAI_ORST_WORDS

Expand Down Expand Up @@ -266,8 +268,11 @@ def thai_dict() -> dict:
global _THAI_DICT
if not _THAI_DICT:
import csv
_THAI_DICT = {"word":[], "meaning":[]}
with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile:

_THAI_DICT = {"word": [], "meaning": []}
with open(
get_corpus_path("thai_dict"), newline="\n", encoding="utf-8"
) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_DICT["word"].append(row["word"])
Expand All @@ -288,38 +293,46 @@ def thai_wsd_dict() -> dict:
global _THAI_WSD_DICT
if not _THAI_WSD_DICT:
_thai_wsd = thai_dict()
_THAI_WSD_DICT = {"word":[],"meaning":[]}
for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
_THAI_WSD_DICT = {"word": [], "meaning": []}
for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]):
_all_value = list(eval(j).values())
_use = []
for k in _all_value:
_use.extend(k)
_use=list(set(_use))
if len(_use)>1:
_use = list(set(_use))
if len(_use) > 1:
_THAI_WSD_DICT["word"].append(i)
_THAI_WSD_DICT["meaning"].append(_use)

return _THAI_WSD_DICT


def thai_synonym() -> dict:
def thai_synonyms() -> dict:
"""
Return Thai synonym.
Return Thai synonyms.
\n(See: `thai_synonym\
<https://pythainlp.github.io/pythainlp-corpus/thai_synonym.html>`_)
:return: Thai words with part-of-speech type and synonym
:rtype: dict
"""
global _THAI_SYNONYM
if _THAI_SYNONYM is None:
global _THAI_SYNONYMS
if not _THAI_SYNONYMS:
import csv
_THAI_SYNONYM = {"word":[], "pos":[], "synonym":[]}
with open(get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8") as csvfile:

_THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []}
with open(
get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8"
) as csvfile:
reader = csv.DictReader(csvfile, delimiter=",")
for row in reader:
_THAI_SYNONYM["word"].append(row["word"])
_THAI_SYNONYM["pos"].append(row["pos"])
_THAI_SYNONYM["synonym"].append(row["synonym"].split("|"))
_THAI_SYNONYMS["word"].append(row["word"])
_THAI_SYNONYMS["pos"].append(row["pos"])
_THAI_SYNONYMS["synonym"].append(row["synonym"].split("|"))

return _THAI_SYNONYMS

return _THAI_SYNONYM

def thai_synonym() -> dict:
    """
    Deprecated alias of :func:`thai_synonyms`.

    Emits a :class:`DeprecationWarning` and then returns whatever
    ``thai_synonyms()`` returns.

    :return: the result of ``thai_synonyms()``
    :rtype: dict
    """
    # stacklevel=2 attributes the warning to the code that called
    # thai_synonym() rather than to this wrapper, so the reported
    # file/line is the one that actually needs to be updated.
    warnings.warn(
        "Deprecated: Use thai_synonyms() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return thai_synonyms()
Loading

0 comments on commit 9bd951b

Please sign in to comment.