diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index fbd25822d..ddb81e8f6 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -17,6 +17,11 @@ get_corpus .. autofunction:: get_corpus :noindex: +get_corpus_as_is +~~~~~~~~~~~~~~~~ +.. autofunction:: get_corpus_as_is + :noindex: + get_corpus_db ~~~~~~~~~~~~~~ .. autofunction:: get_corpus_db @@ -77,9 +82,9 @@ thai_orst_words .. autofunction:: thai_orst_words :noindex: -thai_synonym +thai_synonyms ~~~~~~~~~~~~~~ -.. autofunction:: thai_synonym +.. autofunction:: thai_synonyms :noindex: thai_syllables diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 7a087f603..7fff98d85 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -15,6 +15,7 @@ "countries", "download", "get_corpus", + "get_corpus_as_is", "get_corpus_db", "get_corpus_db_detail", "get_corpus_default_db", @@ -33,10 +34,11 @@ "thai_stopwords", "thai_syllables", "thai_synonym", + "thai_synonyms", + "thai_volubilis_words", + "thai_wikipedia_titles", "thai_words", "thai_wsd_dict", - "volubilis", - "wikipedia_titles", ] import os @@ -88,6 +90,7 @@ def corpus_db_path() -> str: from pythainlp.corpus.core import ( download, get_corpus, + get_corpus_as_is, get_corpus_db, get_corpus_db_detail, get_corpus_default_db, @@ -108,9 +111,10 @@ def corpus_db_path() -> str: thai_stopwords, thai_syllables, thai_synonym, + thai_synonyms, thai_words, thai_wsd_dict, ) from pythainlp.corpus.icu import thai_icu_words -from pythainlp.corpus.volubilis import volubilis -from pythainlp.corpus.wikipedia_titles import wikipedia_titles +from pythainlp.corpus.volubilis import thai_volubilis_words +from pythainlp.corpus.wikipedia import thai_wikipedia_titles diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 35b4c8ed7..36a8c718c 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -12,49 +12,51 @@ "thai_female_names", "thai_male_names", "thai_negations", + "thai_dict", 
"thai_stopwords", "thai_syllables", + "thai_synonym", + "thai_synonyms", "thai_words", - "thai_dict", "thai_wsd_dict", - "thai_synonym", ] from typing import FrozenSet, List, Union +import warnings -from pythainlp.corpus import get_corpus, get_corpus_path +from pythainlp.corpus import get_corpus, get_corpus_as_is, get_corpus_path -_THAI_COUNTRIES = set() +_THAI_COUNTRIES: FrozenSet[str] = frozenset() _THAI_COUNTRIES_FILENAME = "countries_th.txt" -_THAI_THAILAND_PROVINCES = set() -_THAI_THAILAND_PROVINCES_DETAILS = [] +_THAI_THAILAND_PROVINCES: FrozenSet[str] = frozenset() +_THAI_THAILAND_PROVINCES_DETAILS: List[dict] = [] _THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv" -_THAI_SYLLABLES = set() +_THAI_SYLLABLES: FrozenSet[str] = frozenset() _THAI_SYLLABLES_FILENAME = "syllables_th.txt" -_THAI_WORDS = set() +_THAI_WORDS: FrozenSet[str] = frozenset() _THAI_WORDS_FILENAME = "words_th.txt" -_THAI_STOPWORDS = set() +_THAI_STOPWORDS: FrozenSet[str] = frozenset() _THAI_STOPWORDS_FILENAME = "stopwords_th.txt" -_THAI_NEGATIONS = set() +_THAI_NEGATIONS: FrozenSet[str] = frozenset() _THAI_NEGATIONS_FILENAME = "negations_th.txt" -_THAI_FAMLIY_NAMES = set() +_THAI_FAMLIY_NAMES: FrozenSet[str] = frozenset() _THAI_FAMLIY_NAMES_FILENAME = "family_names_th.txt" -_THAI_FEMALE_NAMES = set() +_THAI_FEMALE_NAMES: FrozenSet[str] = frozenset() _THAI_FEMALE_NAMES_FILENAME = "person_names_female_th.txt" -_THAI_MALE_NAMES = set() +_THAI_MALE_NAMES: FrozenSet[str] = frozenset() _THAI_MALE_NAMES_FILENAME = "person_names_male_th.txt" -_THAI_ORST_WORDS = set() +_THAI_ORST_WORDS: FrozenSet[str] = frozenset() _THAI_DICT = {} _THAI_WSD_DICT = {} -_THAI_SYNONYM = None +_THAI_SYNONYMS = {} def countries() -> FrozenSet[str]: @@ -74,7 +76,7 @@ def countries() -> FrozenSet[str]: return _THAI_COUNTRIES -def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]: +def provinces(details: bool = False) -> Union[FrozenSet[str], List[dict]]: """ Return a frozenset of Thailand 
province names in Thai such as "กระบี่", "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี". @@ -96,7 +98,7 @@ def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]: provs = set() prov_details = [] - for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True): + for line in get_corpus_as_is(_THAI_THAILAND_PROVINCES_FILENAME): p = line.split(",") prov = {} @@ -155,14 +157,14 @@ def thai_orst_words() -> FrozenSet[str]: """ Return a frozenset of Thai words from Royal Society of Thailand \n(See: `dev/pythainlp/corpus/thai_orst_words.txt\ - `_) + `_) :return: :class:`frozenset` containing words in the Thai language. :rtype: :class:`frozenset` """ global _THAI_ORST_WORDS if not _THAI_ORST_WORDS: - _THAI_ORST_WORDS = get_corpus("thai_orst_words.txt") + _THAI_ORST_WORDS = get_corpus("orst_words_th.txt") return _THAI_ORST_WORDS @@ -266,8 +268,11 @@ def thai_dict() -> dict: global _THAI_DICT if not _THAI_DICT: import csv - _THAI_DICT = {"word":[], "meaning":[]} - with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: + + _THAI_DICT = {"word": [], "meaning": []} + with open( + get_corpus_path("thai_dict"), newline="\n", encoding="utf-8" + ) as csvfile: reader = csv.DictReader(csvfile, delimiter=",") for row in reader: _THAI_DICT["word"].append(row["word"]) @@ -288,38 +293,46 @@ def thai_wsd_dict() -> dict: global _THAI_WSD_DICT if not _THAI_WSD_DICT: _thai_wsd = thai_dict() - _THAI_WSD_DICT = {"word":[],"meaning":[]} - for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): + _THAI_WSD_DICT = {"word": [], "meaning": []} + for i, j in zip(_thai_wsd["word"], _thai_wsd["meaning"]): _all_value = list(eval(j).values()) _use = [] for k in _all_value: _use.extend(k) - _use=list(set(_use)) - if len(_use)>1: + _use = list(set(_use)) + if len(_use) > 1: _THAI_WSD_DICT["word"].append(i) _THAI_WSD_DICT["meaning"].append(_use) return _THAI_WSD_DICT -def thai_synonym() -> dict: +def thai_synonyms() -> dict: """ - Return Thai synonym. 
+ Return Thai synonyms. \n(See: `thai_synonym\ `_) :return: Thai words with part-of-speech type and synonym :rtype: dict """ - global _THAI_SYNONYM - if _THAI_SYNONYM is None: + global _THAI_SYNONYMS + if not _THAI_SYNONYMS: import csv - _THAI_SYNONYM = {"word":[], "pos":[], "synonym":[]} - with open(get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8") as csvfile: + + _THAI_SYNONYMS = {"word": [], "pos": [], "synonym": []} + with open( + get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8" + ) as csvfile: reader = csv.DictReader(csvfile, delimiter=",") for row in reader: - _THAI_SYNONYM["word"].append(row["word"]) - _THAI_SYNONYM["pos"].append(row["pos"]) - _THAI_SYNONYM["synonym"].append(row["synonym"].split("|")) + _THAI_SYNONYMS["word"].append(row["word"]) + _THAI_SYNONYMS["pos"].append(row["pos"]) + _THAI_SYNONYMS["synonym"].append(row["synonym"].split("|")) + + return _THAI_SYNONYMS - return _THAI_SYNONYM + +def thai_synonym() -> dict: + warnings.warn("Deprecated: Use thai_synonyms() instead.", DeprecationWarning) + return thai_synonyms() diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 50b4004e6..9e73106dc 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -35,7 +35,7 @@ def get_corpus_db(url: str): return corpus_db -def get_corpus_db_detail(name: str, version: str = None) -> dict: +def get_corpus_db_detail(name: str, version: str = '') -> dict: """ Get details about a corpus, using information from local catalog. 
@@ -46,7 +46,7 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict: with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: local_db = json.load(f) - if version is None: + if not version: for corpus in local_db["_default"].values(): if corpus["name"] == name: return corpus @@ -70,30 +70,22 @@ def path_pythainlp_corpus(filename: str) -> str: return os.path.join(corpus_path(), filename) -def get_corpus(filename: str, - as_is: bool = False, - comments: bool = True - ) -> Union[frozenset, list]: +def get_corpus(filename: str, comments: bool = True) -> frozenset: """ - Read corpus data from file and return a frozenset or a list. + Read corpus data from file and return a frozenset. - Each line in the file will be a member of the set or the list. + Each line in the file will be a member of the set. - By default, a frozenset will be return, with whitespace stripped and - empty values and duplicates removed. - - If as_is is True, a list will be return, with no modifications - in member values and their orders. + Whitespace stripped and empty values and duplicates removed. If comments is False, any text at any position after the character '#' in each line will be discarded. 
:param str filename: filename of the corpus to be read - :param bool as_is: no modification to the text, and return a list :param bool comments: keep comments - :return: :class:`frozenset` or :class:`list` consisting of lines in the file - :rtype: :class:`frozenset` or :class:`list` + :return: :class:`frozenset` consisting of lines in the file + :rtype: :class:`frozenset` :Example: :: @@ -108,10 +100,6 @@ def get_corpus(filename: str, # output: # frozenset({'แต่', 'ไม่'}) - get_corpus("negations_th.txt", as_is=True) - # output: - # ['แต่', 'ไม่'] - # input file (ttc_freq.txt): # ตัวบท10 # โดยนัยนี้1 @@ -147,18 +135,49 @@ def get_corpus(filename: str, lines = fh.read().splitlines() if not comments: - # take only text before character '#' - lines = [line.split("#", 1)[0] for line in lines] + # if the line has a '#' character, take only text before the first '#' + lines = [line.split("#", 1)[0].strip() for line in lines] - if as_is: - return lines + return frozenset(filter(None, lines)) - lines = [line.strip() for line in lines] - return frozenset(filter(None, lines)) +def get_corpus_as_is(filename: str) -> list: + """ + Read corpus data from file, as it is, and return a list. + + Each line in the file will be a member of the list. + + No modifications in member values and their orders. + + If strip or comment removal is needed, use get_corpus() instead. 
+ + :param str filename: filename of the corpus to be read + + :return: :class:`list` consisting of lines in the file + :rtype: :class:`list` + + :Example: + :: + + from pythainlp.corpus import get_corpus_as_is + # input file (negations_th.txt): + # แต่ + # ไม่ -def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]: + get_corpus_as_is("negations_th.txt") + # output: + # ['แต่', 'ไม่'] + """ + path = path_pythainlp_corpus(filename) + lines = [] + with open(path, "r", encoding="utf-8-sig") as fh: + lines = fh.read().splitlines() + + return lines + + +def get_corpus_default_db(name: str, version: str = '') -> Union[str, None]: """ Get model path from default_db.json @@ -179,15 +198,17 @@ def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]: return path_pythainlp_corpus( corpus_db[name]["versions"][version]["filename"] ) - elif version is None: # load latest version + elif not version: # load latest version version = corpus_db[name]["latest_version"] return path_pythainlp_corpus( corpus_db[name]["versions"][version]["filename"] ) + return None + def get_corpus_path( - name: str, version: str = None, force: bool = False + name: str, version: str = '', force: bool = False ) -> Union[str, None]: """ Get corpus path. @@ -229,8 +250,9 @@ def get_corpus_path( print(get_corpus_path('wiki_lm_lstm')) # output: /root/pythainlp-data/thwiki_model_lstm.pth """ - # Customize your corpus path then close the line from lines 164 through 190. - _CUSTOMIZE = { + from typing import Dict + + _CUSTOMIZE: Dict[str, str] = { # "the corpus name":"path" } if name in list(_CUSTOMIZE): @@ -379,7 +401,7 @@ def _check_version(cause: str) -> bool: def download( - name: str, force: bool = False, url: str = None, version: str = None + name: str, force: bool = False, url: str = '', version: str = '' ) -> bool: """ Download corpus. 
@@ -430,7 +452,7 @@ def download( corpus = corpus_db[name] print("Corpus:", name) - if version is None: + if not version: for v, file in corpus["versions"].items(): if _check_version(file["pythainlp_version"]): version = v @@ -439,10 +461,7 @@ def download( if version not in corpus["versions"]: print("Not found corpus") return False - elif ( - _check_version(corpus["versions"][version]["pythainlp_version"]) - is False - ): + elif _check_version(corpus["versions"][version]["pythainlp_version"]) is False: print("Versions Corpus not support") return False corpus_versions = corpus["versions"][version] @@ -486,9 +505,7 @@ def download( foldername = name + "_" + str(version) if not os.path.exists(get_full_data_path(foldername)): os.mkdir(get_full_data_path(foldername)) - with zipfile.ZipFile( - get_full_data_path(file_name), "r" - ) as zip: + with zipfile.ZipFile(get_full_data_path(file_name), "r") as zip: zip.extractall(path=get_full_data_path(foldername)) if found: @@ -500,9 +517,7 @@ def download( # This awkward behavior is for backward-compatibility with # database files generated previously using TinyDB if local_db["_default"]: - corpus_no = ( - max((int(no) for no in local_db["_default"])) + 1 - ) + corpus_no = max((int(no) for no in local_db["_default"])) + 1 else: corpus_no = 1 local_db["_default"][str(corpus_no)] = { @@ -565,9 +580,7 @@ def remove(name: str) -> bool: return False with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: db = json.load(f) - data = [ - corpus for corpus in db["_default"].values() if corpus["name"] == name - ] + data = [corpus for corpus in db["_default"].values() if corpus["name"] == name] if data: path = get_corpus_path(name) diff --git a/pythainlp/corpus/icu.py b/pythainlp/corpus/icu.py index 71a002d4e..e7d0198b7 100644 --- a/pythainlp/corpus/icu.py +++ b/pythainlp/corpus/icu.py @@ -17,7 +17,7 @@ def thai_icu_words() -> FrozenSet[str]: Return a frozenset of words from the Thai dictionary for BreakIterator of the International 
Components for Unicode (ICU). - :return: :class:`frozenset` containing `str` + :return: :class:`frozenset` containing Thai words. :rtype: :class:`frozenset` """ diff --git a/pythainlp/corpus/thai_orst_words.txt b/pythainlp/corpus/orst_words_th.txt similarity index 100% rename from pythainlp/corpus/thai_orst_words.txt rename to pythainlp/corpus/orst_words_th.txt diff --git a/pythainlp/corpus/volubilis.py b/pythainlp/corpus/volubilis.py index b422fc30c..38906f125 100644 --- a/pythainlp/corpus/volubilis.py +++ b/pythainlp/corpus/volubilis.py @@ -8,25 +8,25 @@ from pythainlp.corpus.common import get_corpus -_VOLUBILIS = None -_VOLUBILIS_FILENAME = "volubilis_modified.txt" +_VOLUBILIS_WORDS = None +_VOLUBILIS_FILENAME = "volubilis_words_th.txt" -def volubilis() -> FrozenSet[str]: +def thai_volubilis_words() -> FrozenSet[str]: """ - Return a frozenset of words from the Volubilis dictionary. + Return a frozenset of Thai words from the Volubilis dictionary + + See: `dev/pythainlp/corpus/volubilis_words_th.txt\ + `_ - The data is at pythainlp/corpus/volubilis_modified.txt - The word list has beed prepared by the code at: - https://github.com/konbraphat51/Thai_Dictionary_Cleaner - Based Volubilis dictionary 23.1 (March 2023): - https://belisan-volubilis.blogspot.com/ + More info: + https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md - :return: :class:`frozenset` containing words in the Volubilis dictionary. + :return: :class:`frozenset` containing Thai words. 
:rtype: :class:`frozenset` """ - global _VOLUBILIS - if not _VOLUBILIS: - _VOLUBILIS = get_corpus(_VOLUBILIS_FILENAME) + global _VOLUBILIS_WORDS + if not _VOLUBILIS_WORDS: + _VOLUBILIS_WORDS = get_corpus(_VOLUBILIS_FILENAME, comments=False) - return _VOLUBILIS + return _VOLUBILIS_WORDS diff --git a/pythainlp/corpus/volubilis_modified.txt b/pythainlp/corpus/volubilis_words_th.txt similarity index 99% rename from pythainlp/corpus/volubilis_modified.txt rename to pythainlp/corpus/volubilis_words_th.txt index 1d741eabd..39cb66495 100644 --- a/pythainlp/corpus/volubilis_modified.txt +++ b/pythainlp/corpus/volubilis_words_th.txt @@ -1,3 +1,5 @@ +# Thai words from Volubilis dictionary +# SPDX-License-Identifier: CC-BY-SA-4.0 อ๊ะ อา อา diff --git a/pythainlp/corpus/wikipedia_titles.py b/pythainlp/corpus/wikipedia.py similarity index 71% rename from pythainlp/corpus/wikipedia_titles.py rename to pythainlp/corpus/wikipedia.py index a94c54022..abe39f10c 100644 --- a/pythainlp/corpus/wikipedia_titles.py +++ b/pythainlp/corpus/wikipedia.py @@ -9,24 +9,27 @@ from pythainlp.corpus.common import get_corpus _WIKIPEDIA_TITLES = None -_WIKIPEDIA_TITLES_FILENAME = "wikipedia_titles.txt" +_WIKIPEDIA_TITLES_FILENAME = "wikipedia_titles_th.txt" -def wikipedia_titles() -> FrozenSet[str]: +def thai_wikipedia_titles() -> FrozenSet[str]: """ Return a frozenset of words from Thai Wikipedia titles corpus. They are mostly nouns and noun phrases, including event, organization, people, place, and product names. Commonly misspelled words are included intentionally. + See: `dev/pythainlp/corpus/wikipedia_titles_th.txt\ + `_ + More info: https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md - :return: :class:`frozenset` containing words in Thai Wikipedia titles. + :return: :class:`frozenset` containing Thai words. 
:rtype: :class:`frozenset` """ global _WIKIPEDIA_TITLES if not _WIKIPEDIA_TITLES: - _WIKIPEDIA_TITLES = get_corpus(_WIKIPEDIA_TITLES_FILENAME) + _WIKIPEDIA_TITLES = get_corpus(_WIKIPEDIA_TITLES_FILENAME, comments=False) return _WIKIPEDIA_TITLES diff --git a/pythainlp/corpus/wikipedia_titles.txt b/pythainlp/corpus/wikipedia_titles_th.txt similarity index 99% rename from pythainlp/corpus/wikipedia_titles.txt rename to pythainlp/corpus/wikipedia_titles_th.txt index 8d712413e..cb24f407c 100644 --- a/pythainlp/corpus/wikipedia_titles.txt +++ b/pythainlp/corpus/wikipedia_titles_th.txt @@ -1,3 +1,5 @@ +# Titles from Thai Wikipedia +# SPDX-License-Identifier: CC-BY-SA-4.0 ปากหวาน ทวิชกลิ่นประทุม รถไฟฟ้าเซี่ยงไฮ้