From cc228c2f6e65537b604b98a4c1d9bf801cbc45ae Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 5 Sep 2023 11:27:49 +0700 Subject: [PATCH 01/33] Update macos-test.yml --- .github/workflows/macos-test.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml index cf1a86894..8beac057c 100644 --- a/.github/workflows/macos-test.yml +++ b/.github/workflows/macos-test.yml @@ -49,13 +49,14 @@ jobs: conda info conda list python -m pip install --upgrade pip - pip uninstall --y pythainlp - pip install --no-deps fastai==1.0.61 + pip install pytest coverage coveralls conda install -c conda-forge icu conda install -c conda-forge pyicu SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt + pip install deepcut tltk pip install .[full] python -m nltk.downloader omw-1.4 + python -m pip cache purge python -m unittest discover if: matrix.os == 'self-hosted' - shell: bash -l {0} From 60b662e1c3626b41bdb0362cdd4e5d06e19860fc Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 5 Sep 2023 11:28:38 +0700 Subject: [PATCH 02/33] PyThaiNLP v4.1.0beta4 --- pythainlp/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index e83cc7f9f..855c5f0f1 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -17,7 +17,7 @@ # # URL: # For license information, see LICENSE -__version__ = "4.1.0beta3" +__version__ = "4.1.0beta4" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/setup.cfg b/setup.cfg index 2bf56e01a..577a7b890 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.0beta3 +current_version = 4.1.0beta4 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 07def19e3..99304e68f 100644 --- a/setup.py +++ b/setup.py @@ -171,7 +171,7 @@ setup( name="pythainlp", - version="4.1.0beta3", + version="4.1.0beta4", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", From 99b575109f1df7d1b869b6fc50cbfc4e895d19af Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Fri, 22 Sep 2023 18:31:21 +0800 Subject: [PATCH 03/33] Fix "List of possible extras" in README --- README.md | 4 ++-- README_TH.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1ff55683e..786065e35 100644 --- a/README.md +++ b/README.md @@ -72,14 +72,14 @@ Install different releases: ### Installation Options -Some functionalities, like Thai WordNet, may require extra packages. To install those requirements, specify a set of `[name]` immediately after `pythainlp`: +Some functionalities, like Thai WordNet, may require extra packages. To install those requirements, specify a set of `[name]` immediately after `pythainlp`: ```sh pip install pythainlp[extra1,extra2,...] ```
- List of possible `extras` + List of possible extras - `full` (install everything) - `attacut` (to support attacut, a fast and accurate tokenizer) diff --git a/README_TH.md b/README_TH.md index 69597debf..9c37d20d4 100644 --- a/README_TH.md +++ b/README_TH.md @@ -75,14 +75,14 @@ PyThaiNLP ใช้ pip สำหรับจัดการโมดูลแ ### ตัวเลือกการติดตั้ง -บางความสามารถ เช่น Thai WordNet ต้องการโมดูลภายนอกในการทำงานนอกจาก PyThaiNLP ซึ่งในตอนติดตั้ง คุณจะต้องติดตั้งส่วนขยายพิเศษที่จำเป็นหรือ "extras" โดยระบุชื่อลงใน `[name]` ต่อท้าย `pythainlp`: +บางความสามารถ เช่น Thai WordNet ต้องการโมดูลภายนอกในการทำงานนอกจาก PyThaiNLP ซึ่งในตอนติดตั้ง คุณจะต้องติดตั้งส่วนขยายพิเศษที่จำเป็นหรือ "extras" โดยระบุชื่อลงใน `[name]` ต่อท้าย `pythainlp`: ```sh pip install pythainlp[extra1,extra2,...] ```
- รายการสำหรับติดตั้งผ่าน `extras` + รายการสำหรับติดตั้งผ่าน extras - `full` (ติดตั้งทุกอย่าง) - `attacut` (เพื่อสนับสนุน attacut ซึ่งเป็นตัวตัดคำที่ทำงานได้รวดเร็วและมีประสิทธิภาพ) From e9f2f32d8f33f2a6a2e01a7c61a8473427fd7466 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Sun, 24 Sep 2023 13:09:30 +0800 Subject: [PATCH 04/33] Add tzdata as a dependency on Windows --- pythainlp/util/date.py | 2 +- setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pythainlp/util/date.py b/pythainlp/util/date.py index e9ad33c49..7185d0d2c 100644 --- a/pythainlp/util/date.py +++ b/pythainlp/util/date.py @@ -97,7 +97,7 @@ ["ธันวาคม", "ธันวา", "ธ.ค.", "12"] ] thai_full_month_lists_regex = "(" + '|'.join( - [str('|'.join([j for j in i])) for i in thai_full_month_lists] + ['|'.join(i) for i in thai_full_month_lists] ) + ")" year_all_regex = r"(\d\d\d\d|\d\d)" dates_list = "(" + '|'.join( diff --git a/setup.py b/setup.py index 99304e68f..1a5cf2d3b 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,8 @@ requirements = [ "requests>=2.22.0", - "backports.zoneinfo; python_version<'3.9'" + "backports.zoneinfo; python_version<'3.9'", + "tzdata; sys_platform == 'win32'" ] extras = { From fcf567ccbaf8fe40dfb342ba70158f6cdbc7f538 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 24 Sep 2023 15:56:38 +0700 Subject: [PATCH 05/33] PyThaiNLP v4.1.0beta5 --- pythainlp/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 855c5f0f1..7a61fea7d 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -17,7 +17,7 @@ # # URL: # For license information, see LICENSE -__version__ = "4.1.0beta4" +__version__ = "4.1.0beta5" thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars diff --git a/setup.cfg b/setup.cfg index 577a7b890..69349cb6d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.0beta4 +current_version = 4.1.0beta5 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
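Patch 04 above adds `tzdata` only for `sys_platform == 'win32'` because Windows ships no system time zone database, so Python's `zoneinfo` (or `backports.zoneinfo` on Python < 3.9, which is already listed as a requirement) needs the pip-installed `tzdata` package to resolve zone names. A minimal sketch of the lookup this dependency keeps working; the zone name is only an example:

```python
import sys

if sys.version_info >= (3, 9):
    from zoneinfo import ZoneInfo
else:
    # Matches the "backports.zoneinfo; python_version<'3.9'" requirement.
    from backports.zoneinfo import ZoneInfo

# Without the tzdata wheel, this raises ZoneInfoNotFoundError on Windows.
bangkok = ZoneInfo("Asia/Bangkok")
print(bangkok.key)  # "Asia/Bangkok"
```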
diff --git a/setup.py b/setup.py index 1a5cf2d3b..30e7dd832 100644 --- a/setup.py +++ b/setup.py @@ -172,7 +172,7 @@ setup( name="pythainlp", - version="4.1.0beta4", + version="4.1.0beta5", description="Thai Natural Language Processing library", long_description=readme, long_description_content_type="text/markdown", From db6272f588cbf1265c3ddd2e6c08c612c2006efa Mon Sep 17 00:00:00 2001 From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:03:07 +0700 Subject: [PATCH 06/33] Update wtsplit.py add segmentation style --- pythainlp/tokenize/wtsplit.py | 41 ++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py index 6364aeffa..b02d4199c 100644 --- a/pythainlp/tokenize/wtsplit.py +++ b/pythainlp/tokenize/wtsplit.py @@ -30,6 +30,7 @@ def _tokenize( model:str="wtp-bert-mini", tokenize:str="sentence", paragraph_threshold:float=0.5, + style:str='newline', )-> List[str]: global _MODEL_NAME,_MODEL if _MODEL_NAME != model: @@ -38,15 +39,33 @@ def _tokenize( if tokenize=="sentence": return _MODEL.split(text,lang_code=lang_code) else: # Paragraph - return _MODEL.split( - text, - lang_code=lang_code, - do_paragraph_segmentation=True, - paragraph_threshold=paragraph_threshold + if style=='newline': + return _MODEL.split( + text, + lang_code=lang_code, + do_paragraph_segmentation=True, + paragraph_threshold=paragraph_threshold + ) + elif style=='opus100': + return _MODEL.split( + text, + lang_code=lang_code, + style=style, + threshold=paragraph_threshold, + ) + else: + raise ValueError( + f"""Segmentation style \"{style}\" not found. + It might be a typo; if not, please consult our document.""" ) - -def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]: +def tokenize( + text:str, + size:str="mini", + tokenize:str="sentence", + paragraph_threshold:float=0.5, + style:str='newline', + )-> List[str]: _model_load="" if size=="tiny": _model_load="wtp-bert-tiny" @@ -56,4 +75,10 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_thres _model_load="wtp-canine-s-12l" else: # mini _model_load="wtp-bert-mini" - return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold) + return _tokenize( + text, + model=_model_load, + tokenize=tokenize, + paragraph_threshold=paragraph_threshold, + style=style, + ) From 9bfd75478d0cd1504d3f08d4069e5c415a7633ec Mon Sep 17 00:00:00 2001 From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:04:34 +0700 Subject: [PATCH 07/33] Update core.py add segmentation style --- pythainlp/tokenize/core.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 23c08a472..ea33d00df 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -446,7 +446,12 @@ def sent_tokenize( return segments -def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]: +def paragraph_tokenize( + text: str, + engine: str = "wtp-mini", + paragraph_threshold:float=0.5, + style:str='newline', + ) -> List[List[str]]: """ Paragraph tokenizer. 
@@ -492,7 +497,13 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold: else: _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold) + segments = segment( + text, + size=_size, + tokenize="paragraph", + paragraph_threshold=paragraph_threshold, + style=style, + ) else: raise ValueError( From a0053c159e6b6813f50f5f64b4b5eb6fd9064927 Mon Sep 17 00:00:00 2001 From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:46:11 +0700 Subject: [PATCH 08/33] Update wtsplit.py fix segmentation style --- pythainlp/tokenize/wtsplit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py index b02d4199c..2bcbd4183 100644 --- a/pythainlp/tokenize/wtsplit.py +++ b/pythainlp/tokenize/wtsplit.py @@ -50,8 +50,9 @@ def _tokenize( return _MODEL.split( text, lang_code=lang_code, - style=style, + do_paragraph_segmentation=True, threshold=paragraph_threshold, + style=style, ) else: raise ValueError( From c63e568bc3639e0e85e7f60c93fc006757a5d48f Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Mon, 16 Oct 2023 18:21:14 +0800 Subject: [PATCH 09/33] Update code comments and clean up codes --- CONTRIBUTING.md | 50 +++--- INTHEWILD.md | 4 +- README.md | 42 ++--- docs/api/tag.rst | 86 +++++----- docs/api/word_vector.rst | 2 +- docs/api/wsd.rst | 2 +- docs/clean_directory.sh | 2 +- docs/conf.py | 4 +- docs/index.rst | 2 +- pythainlp/__init__.py | 2 +- pythainlp/ancient/__init__.py | 2 +- pythainlp/ancient/aksonhan.py | 10 +- pythainlp/augment/lm/fasttext.py | 24 +-- pythainlp/augment/lm/wangchanberta.py | 6 +- pythainlp/augment/word2vec/bpemb_wv.py | 10 +- pythainlp/augment/word2vec/core.py | 10 +- pythainlp/augment/word2vec/ltw2v.py | 14 +- pythainlp/augment/word2vec/thai2fit.py | 14 +- pythainlp/augment/wordnet.py | 30 ++-- pythainlp/benchmarks/word_tokenization.py | 46 +++-- pythainlp/chat/core.py | 10 +- pythainlp/cli/__init__.py | 4 +- pythainlp/cli/benchmark.py | 6 +- pythainlp/cli/data.py | 6 +- pythainlp/cli/soundex.py | 5 +- pythainlp/cli/tag.py | 4 +- pythainlp/cli/tokenize.py | 3 +- pythainlp/cls/param_free.py | 8 +- pythainlp/coref/_fastcoref.py | 10 +- pythainlp/coref/core.py | 10 +- pythainlp/coref/han_coref.py | 4 +- pythainlp/corpus/__init__.py | 6 +- pythainlp/corpus/common.py | 34 ++-- pythainlp/corpus/conceptnet.py | 4 +- pythainlp/corpus/core.py | 70 ++++---- pythainlp/corpus/corpus_license.md | 6 +- pythainlp/corpus/oscar.py | 10 +- pythainlp/corpus/th_en_translit.py | 12 +- pythainlp/corpus/util.py | 18 +- pythainlp/corpus/wordnet.py | 66 ++++---- pythainlp/el/_multiel.py | 2 +- pythainlp/el/core.py | 4 +- pythainlp/generate/__init__.py | 2 +- pythainlp/generate/core.py | 48 +++--- pythainlp/generate/thai2fit.py | 42 ++--- pythainlp/generate/wangchanglm.py | 28 +-- pythainlp/khavee/core.py | 159 +++++++++--------- pythainlp/khavee/example.py | 26 +-- pythainlp/parse/core.py | 36 ++-- pythainlp/parse/esupar_engine.py | 4 +- pythainlp/parse/spacy_thai_engine.py | 2 +- pythainlp/parse/transformers_ud.py | 2 +- pythainlp/soundex/core.py | 2 +- pythainlp/soundex/lk82.py | 8 +- pythainlp/soundex/metasound.py | 4 +- pythainlp/soundex/prayut_and_somchaip.py | 2 +- pythainlp/soundex/sound.py | 15 +- pythainlp/spell/__init__.py | 2 +- pythainlp/spell/core.py | 28 +-- pythainlp/spell/phunspell.py | 2 +- pythainlp/spell/pn.py | 46 ++--- 
pythainlp/spell/symspellpy.py | 4 +- .../spell/wanchanberta_thai_grammarly.py | 15 +- pythainlp/summarize/core.py | 12 +- pythainlp/summarize/keybert.py | 4 +- pythainlp/summarize/mt5.py | 2 +- pythainlp/tag/_tag_perceptron.py | 2 +- pythainlp/tag/chunk.py | 6 +- pythainlp/tag/crfchunk.py | 4 +- pythainlp/tag/locations.py | 4 +- pythainlp/tag/named_entity.py | 31 ++-- pythainlp/tag/orchid.py | 2 +- pythainlp/tag/perceptron.py | 7 +- pythainlp/tag/pos_tag.py | 10 +- pythainlp/tag/thainer.py | 30 ++-- pythainlp/tag/tltk.py | 16 +- pythainlp/tag/unigram.py | 6 +- pythainlp/tokenize/__init__.py | 2 +- pythainlp/tokenize/_utils.py | 4 +- pythainlp/tokenize/attacut.py | 2 +- pythainlp/tokenize/core.py | 121 +++++++------ pythainlp/tokenize/crfcut.py | 6 +- pythainlp/tokenize/etcc.py | 4 +- pythainlp/tokenize/han_solo.py | 19 +-- pythainlp/tokenize/longest.py | 8 +- pythainlp/tokenize/multi_cut.py | 10 +- pythainlp/tokenize/nercut.py | 12 +- pythainlp/tokenize/newmm.py | 32 ++-- pythainlp/tokenize/nlpo3.py | 6 +- pythainlp/tokenize/pyicu.py | 4 +- pythainlp/tokenize/tcc.py | 14 +- pythainlp/tokenize/tcc_p.py | 16 +- pythainlp/tokenize/thaisumcut.py | 2 +- pythainlp/tokenize/wtsplit.py | 50 +++--- pythainlp/tools/misspell.py | 12 +- pythainlp/tools/path.py | 4 +- pythainlp/translate/core.py | 6 +- pythainlp/translate/en_th.py | 13 +- pythainlp/translate/small100.py | 4 +- pythainlp/translate/th_fr.py | 6 +- pythainlp/translate/tokenization_small100.py | 5 +- pythainlp/translate/zh_th.py | 4 +- pythainlp/transliterate/core.py | 2 +- pythainlp/transliterate/iso_11940.py | 2 +- pythainlp/transliterate/thai2rom.py | 12 +- pythainlp/transliterate/thai2rom_onnx.py | 8 +- pythainlp/transliterate/thaig2p.py | 13 +- pythainlp/transliterate/tltk.py | 2 +- pythainlp/transliterate/w2p.py | 9 +- pythainlp/transliterate/wunsen.py | 14 +- pythainlp/ulmfit/core.py | 14 +- pythainlp/ulmfit/preprocess.py | 34 ++-- pythainlp/ulmfit/tokenizer.py | 10 +- pythainlp/util/__init__.py | 3 +- pythainlp/util/abbreviation.py | 10 +- pythainlp/util/collate.py | 2 +- pythainlp/util/date.py | 18 +- pythainlp/util/digitconv.py | 24 +-- pythainlp/util/emojiconv.py | 6 +- pythainlp/util/encoding.py | 4 +- pythainlp/util/keyboard.py | 16 +- pythainlp/util/keywords.py | 20 +-- pythainlp/util/normalize.py | 8 +- pythainlp/util/numtoword.py | 2 +- pythainlp/util/phoneme.py | 16 +- pythainlp/util/spell_words.py | 56 +++--- pythainlp/util/strftime.py | 8 +- pythainlp/util/syllable.py | 40 ++--- pythainlp/util/thai.py | 20 +-- pythainlp/util/thaiwordcheck.py | 10 +- pythainlp/util/time.py | 24 +-- pythainlp/util/trie.py | 4 +- pythainlp/util/wordtonum.py | 4 +- pythainlp/wangchanberta/core.py | 34 ++-- pythainlp/word_vector/core.py | 36 ++-- pythainlp/wsd/__init__.py | 2 +- pythainlp/wsd/core.py | 26 ++- setup.py | 4 +- tests/test_ancient.py | 26 +-- tests/test_augment.py | 3 +- tests/test_benchmarks.py | 6 +- tests/test_coref.py | 2 +- tests/test_corpus.py | 6 +- tests/test_khavee.py | 7 +- tests/test_misspell.py | 2 +- tests/test_summarize.py | 2 +- tests/test_tag.py | 8 +- tests/test_tokenize.py | 4 +- tests/test_translate.py | 8 +- tests/test_transliterate.py | 3 +- tests/test_ulmfit.py | 42 ++--- tests/test_util.py | 12 +- tests/test_wangchanberta.py | 1 - tokenization-benchmark.md | 12 +- 154 files changed, 1126 insertions(+), 1178 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 159a576b6..451bd8a14 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,28 +7,28 @@ Please refer to our [Contributor Covenant Code 
of Conduct](https://github.com/Py ## Issue Report and Discussion - Discussion: https://github.com/PyThaiNLP/pythainlp/discussions -- GitHub issues (problems and suggestions): https://github.com/PyThaiNLP/pythainlp/issues -- Facebook group (not specific to PyThaiNLP, can be Thai NLP discussion in general): https://www.facebook.com/groups/thainlp +- GitHub issues (for problems and suggestions): https://github.com/PyThaiNLP/pythainlp/issues +- Facebook group (not specific to PyThaiNLP, for Thai NLP discussion in general): https://www.facebook.com/groups/thainlp ## Code ## Code Guidelines -- Follows [PEP8](http://www.python.org/dev/peps/pep-0008/), use [black](https://github.com/ambv/black) with `--line-length` = 79; +- Follow [PEP8](http://www.python.org/dev/peps/pep-0008/), use [black](https://github.com/ambv/black) with `--line-length` = 79; - Name identifiers (variables, classes, functions, module names) with meaningful and pronounceable names (`x` is always wrong); - Please follow this [naming convention](https://namingconvention.org/python/). For example, global constant variables must be in `ALL_CAPS`; -- Write tests for your new features. Test suites are in `tests/` directory. (see "Testing" section below); +- Write tests for your new features. The test suite is in `tests/` directory. (see "Testing" section below); - Run all tests before pushing (just execute `tox`) so you will know if your changes broke something; -- Commented code is [dead - code](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); +- Commented out codes are [dead + codes](http://www.codinghorror.com/blog/2008/07/coding-without-comments.html); - All `#TODO` comments should be turned into [issues](https://github.com/pythainlp/pythainlp/issues) in GitHub; -- When appropriate, use [f-String](https://www.python.org/dev/peps/pep-0498/) +- When appropriate, use [f-string](https://www.python.org/dev/peps/pep-0498/) (use `f"{a} = {b}"`, instead of `"{} = {}".format(a, b)` and `"%s = %s' % (a, b)"`); -- All text files, including source code, must be ended with one empty line. This is [to please git](https://stackoverflow.com/questions/5813311/no-newline-at-end-of-file#5813359) and [to keep up with POSIX standard](https://stackoverflow.com/questions/729692/why-should-text-files-end-with-a-newline). +- All text files, including source codes, must end with one empty line. This is [to please git](https://stackoverflow.com/questions/5813311/no-newline-at-end-of-file#5813359) and [to keep up with POSIX standard](https://stackoverflow.com/questions/729692/why-should-text-files-end-with-a-newline). ### Version Control System @@ -36,7 +36,7 @@ Please refer to our [Contributor Covenant Code of Conduct](https://github.com/Py so it may be a good idea to familiarize yourself with it. - You can start with the [Pro Git book](http://git-scm.com/book/) (free!). -### Commit Comment +### Commit Message - [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/) - [Commit Verbs 101: why I like to use this and why you should also like it.](https://chris.beams.io/posts/git-commit/) @@ -45,7 +45,7 @@ so it may be a good idea to familiarize yourself with it. - We use the famous [gitflow](http://nvie.com/posts/a-successful-git-branching-model/) to manage our branches. -- When you do pull request on GitHub, Travis CI and AppVeyor will run tests +- When you create pull requests on GitHub, Github Actions and AppVeyor will run tests and several checks automatically. 
Click the "Details" link at the end of each check to see what needs to be fixed. @@ -53,16 +53,16 @@ each check to see what needs to be fixed. ## Documentation - We use [Sphinx](https://www.sphinx-doc.org/en/master/) to generate API document -automatically from "docstring" comments in source code. This means the comment -section in the source code is important for the quality of documentation. -- A docstring should start with one summary line, ended the line with a full stop (period), -then followed by a blank line before the start new paragraph. -- A commit to release branches (e.g. `2.2`, `2.1`) with a title **"(build and deploy docs)"** (without quotes) will trigger the system to rebuild the documentation files and upload them to the website https://pythainlp.github.io/docs +automatically from "docstring" comments in source codes. This means the comment +section in the source codes is important for the quality of documentation. +- A docstring should start with one summary line, end with one line with a full stop (period), +then be followed by a blank line before starting a new paragraph. +- A commit to release branches (e.g. `2.2`, `2.1`) with a title **"(build and deploy docs)"** (without quotes) will trigger the system to rebuild the documentation files and upload them to the website https://pythainlp.github.io/docs. ## Testing -We use standard Python `unittest`. Test suites are in `tests/` directory. +We use standard Python `unittest`. The test suite is in `tests/` directory. To run unit tests locally together with code coverage test: @@ -81,12 +81,12 @@ Generate code coverage test in HTML (files will be available in `htmlcov/` direc coverage html ``` -Make sure the same tests pass on Travis CI and AppVeyor. +Make sure the tests pass on both Github Actions and AppVeyor. ## Releasing - We use [semantic versioning](https://semver.org/): MAJOR.MINOR.PATCH, with development build suffix: MAJOR.MINOR.PATCH-devBUILD -- Use [`bumpversion`](https://github.com/c4urself/bump2version/#installation) to manage versioning. +- We use [`bumpversion`](https://github.com/c4urself/bump2version/#installation) to manage versioning. - `bumpversion [major|minor|patch|release|build]` - Example: ``` @@ -129,18 +129,18 @@ Make sure the same tests pass on Travis CI and AppVeyor. -Thanks all the [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contributors). (Image made with [contributors-img](https://contributors-img.firebaseapp.com)) +Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contributors). 
(Image made with [contributors-img](https://contributors-img.firebaseapp.com)) -### Development Lead -- Wannaphong Phatthiyaphaibun - founder, distribution and maintainance -- Korakot Chaovavanich - initial tokenization and soundex code +### Development Leads +- Wannaphong Phatthiyaphaibun - foundation, distribution and maintenance +- Korakot Chaovavanich - initial tokenization and soundex codes - Charin Polpanumas - classification and benchmarking - Peeradej Tanruangporn - documentation -- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintainance +- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintenance - Chakri Lowphansirikul - documentation - Pattarawat Chormai - benchmarking -- Thanathip Suntorntip - nlpO3 maintainance, Rust Developer -- Can Udomcharoenchaikit - documentation and code +- Thanathip Suntorntip - nlpO3 maintenance, Rust Developer +- Can Udomcharoenchaikit - documentation and codes ### Maintainers - Arthit Suriyawongkul diff --git a/INTHEWILD.md b/INTHEWILD.md index b39566cd9..b0479806b 100644 --- a/INTHEWILD.md +++ b/INTHEWILD.md @@ -1,8 +1,8 @@ # Who uses PyThaiNLP? -We'd like to keep track of who is using the package. Please send a PR with your company name or @githubhandle or company name with @githubhandle. +We'd like to keep track of who are using the package. Please send a PR with your company name or @githubhandle or both company name and @githubhandle. -Currently, officially using PyThaiNLP: +Currently, those who are officially using PyThaiNLP are as follows: 1. [Hope Data Annotations Co., Ltd.](https://hopedata.org) ([@hopedataannotations](https://github.com/hopedataannotaions)) 2. [Codustry (Thailand) Co., Ltd.](https://codustry.com) ([@codustry](https://github.com/codustry)) diff --git a/README.md b/README.md index 786065e35..6b4dd4498 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,13 @@ Chat on Matrix -PyThaiNLP is a Python package for text processing and linguistic analysis, similar to [NLTK](https://www.nltk.org/) with focus on Thai language. +PyThaiNLP is a Python package for text processing and linguistic analysis, similar to [NLTK](https://www.nltk.org/) with a focus on the Thai language. PyThaiNLP เป็นไลบารีภาษาไพทอนสำหรับประมวลผลภาษาธรรมชาติ คล้ายกับ NLTK โดยเน้นภาษาไทย [ดูรายละเอียดภาษาไทยได้ที่ README_TH.MD](https://github.com/PyThaiNLP/pythainlp/blob/dev/README_TH.md) **News** -> Now, You can contact or ask any questions with the PyThaiNLP team. Chat on Matrix +> Now, You can contact with or ask any questions of the PyThaiNLP team. Chat on Matrix | Version | Description | Status | |:------:|:--:|:------:| @@ -37,7 +37,7 @@ PyThaiNLP เป็นไลบารีภาษาไพทอนสำหร ## Capabilities -PyThaiNLP provides standard NLP functions for Thai, for example part-of-speech tagging, linguistic unit segmentation (syllable, word, or sentence). Some of these functions are also available via command-line interface. +PyThaiNLP provides standard NLP functions for Thai, for example part-of-speech tagging, linguistic unit segmentation (syllable, word, or sentence). Some of these functions are also available via the command-line interface.
List of Features @@ -48,11 +48,11 @@ PyThaiNLP provides standard NLP functions for Thai, for example part-of-speech t - Thai spelling suggestion and correction (`spell` and `correct`) - Thai transliteration (`transliterate`) - Thai soundex (`soundex`) with three engines (`lk82`, `udom83`, `metasound`) -- Thai collation (sort by dictionary order) (`collate`) +- Thai collation (sorted by dictionary order) (`collate`) - Read out number to Thai words (`bahttext`, `num_to_thaiword`) - Thai datetime formatting (`thai_strftime`) - Thai-English keyboard misswitched fix (`eng_to_thai`, `thai_to_eng`) -- Command-line interface for basic functions, like tokenization and pos tagging (run `thainlp` in your shell) +- Command-line interface for basic functions, like tokenization and POS tagging (run `thainlp` in your shell)
@@ -67,7 +67,7 @@ This will install the latest stable release of PyThaiNLP. Install different releases: - Stable release: `pip install --upgrade pythainlp` -- Pre-release (near ready): `pip install --upgrade --pre pythainlp` +- Pre-release (nearly ready): `pip install --upgrade --pre pythainlp` - Development (likely to break things): `pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip` ### Installation Options @@ -92,27 +92,27 @@ pip install pythainlp[extra1,extra2,...] - `wordnet` (for Thai WordNet API)
-For dependency details, look at `extras` variable in [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py). +For dependency details, look at the `extras` variable in [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py). -## Data directory +## Data Directory -- Some additional data, like word lists and language models, may get automatically download during runtime. +- Some additional data, like word lists and language models, may be automatically downloaded during runtime. - PyThaiNLP caches these data under the directory `~/pythainlp-data` by default. -- Data directory can be changed by specifying the environment variable `PYTHAINLP_DATA_DIR`. +- The data directory can be changed by specifying the environment variable `PYTHAINLP_DATA_DIR`. - See the data catalog (`db.json`) at https://github.com/PyThaiNLP/pythainlp-corpus ## Command-Line Interface -Some of PyThaiNLP functionalities can be used at command line, using `thainlp` command. +Some of PyThaiNLP functionalities can be used via command line with the `thainlp` command. -For example, displaying a catalog of datasets: +For example, to display a catalog of datasets: ```sh thainlp data catalog ``` -Showing how to use: +To show how to use: ```sh thainlp help ``` @@ -122,16 +122,16 @@ thainlp help | | License | |:---|:----| -| PyThaiNLP Source Code and Notebooks | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) | +| PyThaiNLP source codes and notebooks | [Apache Software License 2.0](https://github.com/PyThaiNLP/pythainlp/blob/dev/LICENSE) | | Corpora, datasets, and documentations created by PyThaiNLP | [Creative Commons Zero 1.0 Universal Public Domain Dedication License (CC0)](https://creativecommons.org/publicdomain/zero/1.0/)| | Language models created by PyThaiNLP | [Creative Commons Attribution 4.0 International Public License (CC-by)](https://creativecommons.org/licenses/by/4.0/) | -| Other corpora and models that may included with PyThaiNLP | See [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) | +| Other corpora and models that may be included in PyThaiNLP | See [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md) | ## Contribute to PyThaiNLP -- Please do fork and create a pull request :) -- For style guide and other information, including references to algorithms we use, please refer to our [contributing](https://github.com/PyThaiNLP/pythainlp/blob/dev/CONTRIBUTING.md) page. +- Please fork and create a pull request :) +- For style guides and other information, including references to algorithms we use, please refer to our [contributing](https://github.com/PyThaiNLP/pythainlp/blob/dev/CONTRIBUTING.md) page. ## Who uses PyThaiNLP? @@ -140,13 +140,13 @@ You can read [INTHEWILD.md](https://github.com/PyThaiNLP/pythainlp/blob/dev/INTH ## Citations -If you use `PyThaiNLP` in your project or publication, please cite the library as follows +If you use `PyThaiNLP` in your project or publication, please cite the library as follows: ``` Wannaphong Phatthiyaphaibun, Korakot Chaovavanich, Charin Polpanumas, Arthit Suriyawongkul, Lalita Lowphansirikul, & Pattarawat Chormai. (2016, Jun 27). PyThaiNLP: Thai Natural Language Processing in Python. Zenodo. 
http://doi.org/10.5281/zenodo.3519354
 ```
 
-or BibTeX entry:
+or as a BibTeX entry:
 
 ``` bib
 @misc{pythainlp,
@@ -166,7 +166,7 @@ or BibTeX entry:
 | Logo | Description |
 | --- | ----------- |
 | [![VISTEC-depa Thailand Artificial Intelligence Research Institute](https://airesearch.in.th/assets/img/logo/airesearch-logo.svg)](https://airesearch.in.th/) | Since 2019, our contributors Korakot Chaovavanich and Lalita Lowphansirikul have been supported by [VISTEC-depa Thailand Artificial Intelligence Research Institute](https://airesearch.in.th/). |
-| [![MacStadium](https://i.imgur.com/rKy1dJX.png)](https://www.macstadium.com) | We get support free Mac Mini M1 from [MacStadium](https://www.macstadium.com) for doing Build CI. |
+| [![MacStadium](https://i.imgur.com/rKy1dJX.png)](https://www.macstadium.com) | We receive a free Mac Mini M1 from [MacStadium](https://www.macstadium.com) for running CI builds. |
 
 ------
 
@@ -181,5 +181,5 @@ or BibTeX entry:
- Beware of malware if you use code from mirrors other than the official two at GitHub and GitLab.
+ Beware of malware if you use code from mirrors other than the official two on GitHub and GitLab.
 </details>
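Back to the tokenizer work: the `style` keyword added in patches 06 through 08 is threaded from `paragraph_tokenize` down to the wtpsplit backend. A minimal usage sketch, assuming the optional wtpsplit dependency is installed; the Thai string is illustrative and the segmentation model is downloaded on first use:

```python
from pythainlp.tokenize import paragraph_tokenize

text = "พีไทยเอ็นแอลพีเป็นเครื่องมือประมวลผลภาษาไทย มีตัวแบ่งประโยคและย่อหน้าให้เลือกหลายแบบ"

# Default behaviour, unchanged by the patches: newline-style paragraphs.
paragraphs = paragraph_tokenize(text, engine="wtp-mini")

# New option: opus100-style segmentation, sharing the threshold knob.
paragraphs_opus = paragraph_tokenize(
    text,
    engine="wtp-mini",
    paragraph_threshold=0.5,
    style="opus100",
)
```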
diff --git a/docs/api/tag.rst b/docs/api/tag.rst index e8769a225..c7d471037 100644 --- a/docs/api/tag.rst +++ b/docs/api/tag.rst @@ -3,9 +3,9 @@ pythainlp.tag ===================================== The :class:`pythainlp.tag` contains functions that are used to mark linguistic and other annotation to different parts of a text including -part-of-speech (POS) tag and named entity (NE) tag. +part-of-speech (POS) tags and named entity (NE) tags. -For POS tags, there are three set of available tags: `Universal POS tags `_, ORCHID POS tags [#Sornlertlamvanich_2000]_, and LST20 POS tags [#Prachya_2020]_. +For POS tags, there are three sets of available tags: `Universal POS tags `_, ORCHID POS tags [#Sornlertlamvanich_2000]_, and LST20 POS tags [#Prachya_2020]_. The following table shows Universal POS tags as used in Universal Dependencies (UD): @@ -91,13 +91,13 @@ Abbreviation Part-of-Speech tag Examples PUNC Punctuation (, ), “, ,, ; ============ ================================================= ================================= -ORCHID corpus uses different set of POS tags. Thus, we make UD POS tags version for ORCHID corpus. +ORCHID corpus uses a different set of POS tags. Thus, we make UD POS tags version for ORCHID corpus. The following table shows the mapping of POS tags from ORCHID to UD: -=============== ======================= -ORCHID POS tags Coresponding UD POS tag -=============== ======================= +=============== ======================== +ORCHID POS tags Corresponding UD POS tag +=============== ======================== NOUN NOUN NCMN NOUN NTTL NOUN @@ -164,47 +164,47 @@ Details about LST20 POS tags are available in [#Prachya_2020]_. The following table shows the mapping of POS tags from LST20 to UD: -+----------------+-------------------------+ -| LST20 POS tags | Coresponding UD POS tag | -+================+=========================+ -| AJ | ADJ | -+----------------+-------------------------+ -| AV | ADV | -+----------------+-------------------------+ -| AX | AUX | -+----------------+-------------------------+ -| CC | CCONJ | -+----------------+-------------------------+ -| CL | NOUN | -+----------------+-------------------------+ -| FX | NOUN | -+----------------+-------------------------+ -| IJ | INTJ | -+----------------+-------------------------+ -| NN | NOUN | -+----------------+-------------------------+ -| NU | NUM | -+----------------+-------------------------+ -| PA | PART | -+----------------+-------------------------+ -| PR | PROPN | -+----------------+-------------------------+ -| PS | ADP | -+----------------+-------------------------+ -| PU | PUNCT | -+----------------+-------------------------+ -| VV | VERB | -+----------------+-------------------------+ -| XX | X | -+----------------+-------------------------+ ++----------------+--------------------------+ +| LST20 POS tags | Corresponding UD POS tag | ++================+==========================+ +| AJ | ADJ | ++----------------+--------------------------+ +| AV | ADV | ++----------------+--------------------------+ +| AX | AUX | ++----------------+--------------------------+ +| CC | CCONJ | ++----------------+--------------------------+ +| CL | NOUN | ++----------------+--------------------------+ +| FX | NOUN | ++----------------+--------------------------+ +| IJ | INTJ | ++----------------+--------------------------+ +| NN | NOUN | ++----------------+--------------------------+ +| NU | NUM | ++----------------+--------------------------+ +| PA | PART | ++----------------+--------------------------+ +| PR | 
PROPN | ++----------------+--------------------------+ +| PS | ADP | ++----------------+--------------------------+ +| PU | PUNCT | ++----------------+--------------------------+ +| VV | VERB | ++----------------+--------------------------+ +| XX | X | ++----------------+--------------------------+ -For the NE, we use `Inside-outside-beggining (IOB) `_ format to tag NE for each word. +For the NE, we use `Inside-outside-beginning (IOB) `_ format to tag NE for each word. -*B-* prefix indicates the begining token of the chunk. *I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk. +*B-* prefix indicates the beginning token of the chunk. *I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk. For instance, given a sentence "บารัค โอบามาเป็นประธานธิปดี", it would tag the tokens "บารัค", "โอบามา", "เป็น", "ประธานาธิปดี" with "B-PERSON", "I-PERSON", "O", and "O" respectively. -The following table shows named entity (NE) tags as used PyThaiNLP: +The following table shows named entity (NE) tags as used in PyThaiNLP: ============================ ================================= Named Entity tag Examples @@ -244,7 +244,7 @@ Tagger Engines perceptron ++++++++++ -Perceptron tagger is the part-of-speech tagging using the averaged, structured perceptron algorithm. +Perceptron tagger is a part-of-speech tagging using the averaged, structured perceptron algorithm. unigram +++++++ diff --git a/docs/api/word_vector.rst b/docs/api/word_vector.rst index 2de638b6e..b9c4b2cd1 100644 --- a/docs/api/word_vector.rst +++ b/docs/api/word_vector.rst @@ -2,7 +2,7 @@ pythainlp.word_vector ==================================== -The :class:`word_vector` contains functions that makes use of a pre-trained vector public data. +The :class:`word_vector` contains functions that make use of a pre-trained vector of public data. Dependencies ------------ diff --git a/docs/api/wsd.rst b/docs/api/wsd.rst index d62691e5b..30656b4ff 100644 --- a/docs/api/wsd.rst +++ b/docs/api/wsd.rst @@ -3,7 +3,7 @@ pythainlp.wsd ============= -The :class:`pythainlp.wsd` contains get word sense function for Thai Word Sense Disambiguation (WSD). +The :class:`pythainlp.wsd` contains functions used to get word senses for Thai Word Sense Disambiguation (WSD). Modules diff --git a/docs/clean_directory.sh b/docs/clean_directory.sh index 5f7cb555e..a39243d2a 100644 --- a/docs/clean_directory.sh +++ b/docs/clean_directory.sh @@ -5,7 +5,7 @@ # $1 : FTP_USER # $2 : FTP_PASSWORD # $3 : FTP_HOST -# $4 : Brnach name +# $4 : Branch name FTP_USER=$1 FTP_PASSWORD=$2 diff --git a/docs/conf.py b/docs/conf.py index 0459202b1..2755ae110 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,7 +41,7 @@ ) .decode() .strip() - .split("/")[-1] + .rsplit("/", maxsplit=1)[-1] ) release = ( os.environ["RELEASE"] @@ -53,7 +53,7 @@ ) .decode() .strip() - .split("-")[0] + .split("-", maxsplit=1)[0] ) # today = ( # os.environ["TODAY"] diff --git a/docs/index.rst b/docs/index.rst index be660cf04..11f3dc9d3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -.. pythainlp documentation master file, created by +.. PyThaiNLP documentation master file, created by sphinx-quickstart on Sat Jun 23 15:23:30 2018. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. 
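The tag mappings documented in the `docs/api/tag.rst` changes above are selected through the `corpus` argument of `pythainlp.tag.pos_tag`. A short sketch tagging the example sentence from that file, assuming a default install; `orchid_ud` returns ORCHID tags already mapped to UD, per the first table:

```python
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag

words = word_tokenize("บารัค โอบามาเป็นประธานาธิบดี")

print(pos_tag(words, corpus="orchid"))     # native ORCHID tags
print(pos_tag(words, corpus="orchid_ud"))  # mapped to Universal POS tags
```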
diff --git a/pythainlp/__init__.py b/pythainlp/__init__.py index 7a61fea7d..c48294248 100644 --- a/pythainlp/__init__.py +++ b/pythainlp/__init__.py @@ -48,7 +48,7 @@ thai_digits = "๐๑๒๓๔๕๖๗๘๙" # 10 thai_symbols = "\u0e3f" # Thai Bath ฿ -# All Thai characters that presented in Unicode +# All Thai characters that are presented in Unicode thai_characters = "".join( [thai_letters, thai_punctuations, thai_digits, thai_symbols] ) diff --git a/pythainlp/ancient/__init__.py b/pythainlp/ancient/__init__.py index f33a23409..b2cae98cc 100644 --- a/pythainlp/ancient/__init__.py +++ b/pythainlp/ancient/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Ancient language of Thai language +Ancient versions of the Thai language """ __all__ = ["aksonhan_to_current"] diff --git a/pythainlp/ancient/aksonhan.py b/pythainlp/ancient/aksonhan.py index 4f984014c..e2bd298a9 100644 --- a/pythainlp/ancient/aksonhan.py +++ b/pythainlp/ancient/aksonhan.py @@ -32,17 +32,17 @@ _dict_thai = set(thai_orst_words()) # call Thai words -def aksonhan_to_current(word:str)->str: +def aksonhan_to_current(word: str) -> str: """ - AksonHan words convert to current Thai words + Convert AksonHan words to current Thai words - AksonHan (อักษรหัน) is write down two consonants as the \ + AksonHan (อักษรหัน) writes down two consonants for the \ spelling of the /a/ vowels. (สระ อะ). - Today, รร is an aksonHan words that still used in Thai. + Today, รร is an aksonHan word that is still used in Thai. :param str word: Thai word - :return: Thai AksonHan convert to current Thai words + :return: Thai AksonHan to be converted to current Thai word :rtype: str :Example: diff --git a/pythainlp/augment/lm/fasttext.py b/pythainlp/augment/lm/fasttext.py index f4cc6ea59..2d8438770 100644 --- a/pythainlp/augment/lm/fasttext.py +++ b/pythainlp/augment/lm/fasttext.py @@ -12,16 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import itertools from typing import List, Tuple from gensim.models.fasttext import FastText as FastText_gensim -from pythainlp.tokenize import word_tokenize from gensim.models.keyedvectors import KeyedVectors -import itertools +from pythainlp.tokenize import word_tokenize class FastTextAug: """ - Text Augment from FastText + Text Augment from fastText :param str model_path: path of model file """ @@ -40,18 +40,18 @@ def __init__(self, model_path: str): def tokenize(self, text: str) -> List[str]: """ - Thai text tokenize for fasttext + Thai text tokenization for fastText - :param str text: thai text + :param str text: Thai text - :return: list of word + :return: list of words :rtype: List[str] """ return word_tokenize(text, engine="icu") def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]: """ - :param str sent: text sentence + :param str sent: text of sentence :param float p: probability :rtype: List[List[str]] """ @@ -71,14 +71,14 @@ def augment( self, sentence: str, n_sent: int = 1, p: float = 0.7 ) -> List[Tuple[str]]: """ - Text Augment from FastText + Text Augment from fastText - You wants to download thai model + You may want to download the Thai model from https://fasttext.cc/docs/en/crawl-vectors.html. 
- :param str sentence: thai sentence - :param int n_sent: number sentence - :param float p: Probability of word + :param str sentence: Thai sentence + :param int n_sent: number of sentences + :param float p: probability of word :return: list of synonyms :rtype: List[Tuple[str]] diff --git a/pythainlp/augment/lm/wangchanberta.py b/pythainlp/augment/lm/wangchanberta.py index 39ec04a4b..06b4134a5 100644 --- a/pythainlp/augment/lm/wangchanberta.py +++ b/pythainlp/augment/lm/wangchanberta.py @@ -12,12 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import List from transformers import ( CamembertTokenizer, pipeline, ) -import random -from typing import List model_name = "airesearch/wangchanberta-base-att-spm-uncased" @@ -52,9 +51,6 @@ def generate(self, sentence: str, num_replace_tokens: int = 3): num_replace_tokens = len(sent) masked_text = self.input_text for i in range(num_replace_tokens): - replace_token = [ - sent.pop(random.randrange(len(sent))) for _ in range(1) - ][0] masked_text = masked_text + self.MASK_TOKEN self.sent2 += [ str(j["sequence"]).replace(" ", "").replace("", "") diff --git a/pythainlp/augment/word2vec/bpemb_wv.py b/pythainlp/augment/word2vec/bpemb_wv.py index 2b74ccf2a..bba700c3f 100644 --- a/pythainlp/augment/word2vec/bpemb_wv.py +++ b/pythainlp/augment/word2vec/bpemb_wv.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pythainlp.augment.word2vec.core import Word2VecAug from typing import List, Tuple +from pythainlp.augment.word2vec.core import Word2VecAug class BPEmbAug: @@ -33,7 +33,7 @@ def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300): def tokenizer(self, text: str) -> List[str]: """ - :param str text: thai text + :param str text: Thai text :rtype: List[str] """ return self.bpemb_temp.encode(text) @@ -52,9 +52,9 @@ def augment( """ Text Augment using word2vec from BPEmb - :param str sentence: thai sentence - :param int n_sent: number sentence - :param float p: Probability of word + :param str sentence: Thai sentence + :param int n_sent: number of sentence + :param float p: probability of word :return: list of synonyms :rtype: List[str] diff --git a/pythainlp/augment/word2vec/core.py b/pythainlp/augment/word2vec/core.py index 0e3f04a22..76beff0df 100644 --- a/pythainlp/augment/word2vec/core.py +++ b/pythainlp/augment/word2vec/core.py @@ -21,9 +21,9 @@ def __init__( self, model: str, tokenize: object, type: str = "file" ) -> None: """ - :param str model: path model + :param str model: path of model :param object tokenize: tokenize function - :param str type: moodel type (file, binary) + :param str type: model type (file, binary) """ import gensim.models.keyedvectors as word2vec @@ -40,7 +40,7 @@ def __init__( def modify_sent(self, sent: str, p: float = 0.7) -> List[List[str]]: """ - :param str sent: text sentence + :param str sent: text of sentence :param float p: probability :rtype: List[List[str]] """ @@ -60,8 +60,8 @@ def augment( self, sentence: str, n_sent: int = 1, p: float = 0.7 ) -> List[Tuple[str]]: """ - :param str sentence: text sentence - :param int n_sent: max number for synonyms sentence + :param str sentence: text of sentence + :param int n_sent: maximum number of synonymous sentences :param int p: probability :return: list of 
synonyms diff --git a/pythainlp/augment/word2vec/ltw2v.py b/pythainlp/augment/word2vec/ltw2v.py index 870d6ebbc..2debdeca1 100644 --- a/pythainlp/augment/word2vec/ltw2v.py +++ b/pythainlp/augment/word2vec/ltw2v.py @@ -12,10 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import List, Tuple from pythainlp.augment.word2vec.core import Word2VecAug from pythainlp.corpus import get_corpus_path from pythainlp.tokenize import word_tokenize -from typing import List, Tuple class LTW2VAug: @@ -32,14 +32,14 @@ def __init__(self): def tokenizer(self, text: str) -> List[str]: """ - :param str text: thai text + :param str text: Thai text :rtype: List[str] """ return word_tokenize(text, engine="newmm") def load_w2v(self): # insert substitute """ - Load ltw2v word2vec model + Load LTW2V's word2vec model """ self.aug = Word2VecAug(self.ltw2v_wv, self.tokenizer, type="binary") @@ -49,11 +49,11 @@ def augment( """ Text Augment using word2vec from Thai2Fit - :param str sentence: thai sentence - :param int n_sent: number sentence - :param float p: Probability of word + :param str sentence: Thai sentence + :param int n_sent: number of sentence + :param float p: probability of word - :return: list of text augment + :return: list of text augmented :rtype: List[Tuple[str]] :Example: diff --git a/pythainlp/augment/word2vec/thai2fit.py b/pythainlp/augment/word2vec/thai2fit.py index 7aa5979c2..08ef5d89d 100644 --- a/pythainlp/augment/word2vec/thai2fit.py +++ b/pythainlp/augment/word2vec/thai2fit.py @@ -12,10 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Tuple from pythainlp.augment.word2vec.core import Word2VecAug from pythainlp.corpus import get_corpus_path from pythainlp.tokenize import THAI2FIT_TOKENIZER -from typing import List, Tuple class Thai2fitAug: @@ -32,14 +32,14 @@ def __init__(self): def tokenizer(self, text: str) -> List[str]: """ - :param str text: thai text + :param str text: Thai text :rtype: List[str] """ return THAI2FIT_TOKENIZER.word_tokenize(text) def load_w2v(self): """ - Load thai2fit word2vec model + Load Thai2Fit's word2vec model """ self.aug = Word2VecAug(self.thai2fit_wv, self.tokenizer, type="binary") @@ -49,11 +49,11 @@ def augment( """ Text Augment using word2vec from Thai2Fit - :param str sentence: thai sentence - :param int n_sent: number sentence - :param float p: Probability of word + :param str sentence: Thai sentence + :param int n_sent: number of sentence + :param float p: probability of word - :return: list of text augment + :return: list of text augmented :rtype: List[Tuple[str]] :Example: diff --git a/pythainlp/augment/wordnet.py b/pythainlp/augment/wordnet.py index df72f4432..30e638909 100644 --- a/pythainlp/augment/wordnet.py +++ b/pythainlp/augment/wordnet.py @@ -20,13 +20,13 @@ "postype2wordnet", ] -from pythainlp.corpus import wordnet from collections import OrderedDict -from pythainlp.tokenize import word_tokenize -from pythainlp.tag import pos_tag +import itertools from typing import List from nltk.corpus import wordnet as wn -import itertools +from pythainlp.corpus import wordnet +from pythainlp.tokenize import word_tokenize +from pythainlp.tag import pos_tag orchid = { @@ -112,9 +112,9 @@ def postype2wordnet(pos: str, corpus: str): """ - convert part-of-speech type to wordnet type + Convert part-of-speech type to wordnet type - :param str pos: pos type + :param str pos: POS type :param str corpus: part-of-speech corpus **Options for corpus** @@ -137,11 +137,11 @@ def find_synonyms( self, word: str, pos: str = None, postag_corpus: str = "orchid" ) -> List[str]: """ - Find synonyms from wordnet + Find synonyms using wordnet :param str word: word :param str pos: part-of-speech type - :param str postag_corpus: postag corpus name + :param str postag_corpus: name of POS tag corpus :return: list of synonyms :rtype: List[str] """ @@ -175,11 +175,11 @@ def augment( """ Text Augment using wordnet - :param str sentence: thai sentence - :param object tokenize: function for tokenize word - :param int max_syn_sent: max number for synonyms sentence - :param bool postag: on part-of-speech - :param str postag_corpus: postag corpus name + :param str sentence: Thai sentence + :param object tokenize: function for tokenizing words + :param int max_syn_sent: maximum number of synonymous sentences + :param bool postag: use part-of-speech + :param str postag_corpus: name of POS tag corpus :return: list of synonyms :rtype: List[Tuple[str]] @@ -206,7 +206,7 @@ def augment( self.list_pos = pos_tag(self.list_words, corpus=postag_corpus) for word, pos in self.list_pos: self.temp = self.find_synonyms(word, pos, postag_corpus) - if self.temp == []: + if not self.temp: self.list_synonym.append([word]) else: self.list_synonym.append(self.temp) @@ -214,7 +214,7 @@ def augment( else: for word in self.list_words: self.temp = self.find_synonyms(word) - if self.temp == []: + if not self.temp: self.list_synonym.append([word]) else: self.list_synonym.append(self.temp) diff --git a/pythainlp/benchmarks/word_tokenization.py b/pythainlp/benchmarks/word_tokenization.py index acee19cd4..a9836b262 100644 --- 
a/pythainlp/benchmarks/word_tokenization.py +++ b/pythainlp/benchmarks/word_tokenization.py @@ -22,7 +22,7 @@ SEPARATOR = "|" -# regex for removing to a space surrounded by separators, i.e. | | +# regex for removing one space surrounded by separators, i.e. | | SURROUNDING_SEPS_RX = re.compile( "{sep}? ?{sep}$".format(sep=re.escape(SEPARATOR)) ) @@ -33,7 +33,7 @@ # regex for removing tags, i.e. , TAG_RX = re.compile(r"<\/?[A-Z]+>") -# regex for tailing separator, i.e. a|dog| -> a|dog +# regex for removing trailing separators, i.e. a|dog| -> a|dog TAILING_SEP_RX = re.compile("{sep}$".format(sep=re.escape(SEPARATOR))) @@ -54,19 +54,19 @@ def _f1(precision: float, recall: float) -> float: def _flatten_result(my_dict: dict, sep: str = ":") -> dict: """ - Flatten two-level dictionary. + Flatten two-dimension dictionary. - Use keys in the first level as a prefix for keys in the two levels. + Use keys in the first dimension as a prefix for keys in the second dimension. For example, my_dict = { "a": { "b": 7 } } flatten(my_dict) { "a:b": 7 } - :param dict my_dict: contains stats dictionary + :param dict my_dict: dictionary containing stats :param str sep: separator between the two keys (default: ":") - :return: a one-level dictionary with key combined + :return: a one-dimension dictionary with keys combined :rtype: dict[str, float | str] """ items = [] @@ -80,12 +80,12 @@ def _flatten_result(my_dict: dict, sep: str = ":") -> dict: def benchmark(ref_samples: List[str], samples: List[str]) -> pd.DataFrame: """ - Performace benchmark of samples. + Performance benchmarking for samples. Please see :meth:`pythainlp.benchmarks.word_tokenization.compute_stats` for - metrics being computed. + the computed metrics. - :param list[str] ref_samples: ground truth samples + :param list[str] ref_samples: ground truth for samples :param list[str] samples: samples that we want to evaluate :return: dataframe with row x col = len(samples) x len(metrics) @@ -127,7 +127,7 @@ def preprocessing(txt: str, remove_space: bool = True) -> str: Clean up text before performing evaluation. :param str text: text to be preprocessed - :param bool remove_space: whether remove white space + :param bool remove_space: whether to remove white space :return: preprocessed text :rtype: str @@ -150,26 +150,26 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: """ Compute statistics for tokenization quality - These statistics includes: + These statistics include: **Character-Level**: True Positive, False Positive, True Negative, False Negative, Precision, Recall, and f1 **Word-Level**: Precision, Recall, and f1 **Other**: - - Correct tokenization indicator: {0, 1} sequence indicating the correspoding + - Correct tokenization indicator: {0, 1} sequence indicating that the corresponding word is tokenized correctly. 
- :param str ref_sample: ground truth samples + :param str ref_sample: ground truth for samples :param str samples: samples that we want to evaluate - :return: metrics in character and word-level and correctly tokenized word indicators + :return: metrics at character- and word-level and indicators of correctly tokenized words :rtype: dict[str, float | str] """ ref_sample = _binary_representation(ref_sample) sample = _binary_representation(raw_sample) - # Compute charater-level statistics + # Compute character-level statistics c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0) c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]] @@ -181,17 +181,13 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: c_tn = np.sum(ref_sample[c_neg_pred] == 0) c_fn = np.sum(ref_sample[c_neg_pred] == 1) - c_precision = c_tp / (c_tp + c_fp) - c_recall = c_tp / (c_tp + c_fn) - c_f1 = _f1(c_precision, c_recall) - # Compute word-level statistics # Find correctly tokenized words in the reference sample - word_boundaries = _find_word_boudaries(ref_sample) + word_boundaries = _find_word_boundaries(ref_sample) # Find correctly tokenized words in the sample - ss_boundaries = _find_word_boudaries(sample) + ss_boundaries = _find_word_boundaries(sample) tokenization_indicators = _find_words_correctly_tokenised( word_boundaries, ss_boundaries ) @@ -199,7 +195,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: correctly_tokenised_words = np.sum(tokenization_indicators) tokenization_indicators = list( - map(lambda x: str(x), tokenization_indicators) + map(str, tokenization_indicators) ) return { @@ -222,7 +218,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: def _binary_representation(txt: str, verbose: bool = False): """ - Transform text to {0, 1} sequence. + Transform text into {0, 1} sequence. where (1) indicates that the corresponding character is the beginning of a word. For example, ผม|ไม่|ชอบ|กิน|ผัก -> 10100... @@ -253,9 +249,9 @@ def _binary_representation(txt: str, verbose: bool = False): return bin_rept -def _find_word_boudaries(bin_reps) -> list: +def _find_word_boundaries(bin_reps) -> list: """ - Find start and end location of each word. + Find the starting and ending location of each word. :param str bin_reps: binary representation of a text diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index b4b519009..06e0f51aa 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -18,12 +18,12 @@ class ChatBotModel: def __init__(self): """ - Chat with AI generation + Chat using AI generation """ self.history = [] def reset_chat(self): """ - Reset chat by clean history + Reset chat by cleaning history """ self.history = [] def load_model( @@ -65,8 +65,8 @@ def chat(self, text:str)->str: """ Chatbot - :param str text: text for asking chatbot. - :return: the answer from chatbot. + :param str text: text for asking chatbot with. + :return: answer from chatbot. 
:rtype: str :Example: :: @@ -84,7 +84,7 @@ def chat(self, text:str)->str: # output: [('สวัสดี', 'ยินดีที่ได้รู้จัก')] """ _temp="" - if self.history!=[]: + if self.history: for h,b in self.history: _temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":h,"bot":b})+self.model.stop_token _temp+=self.model.PROMPT_DICT['prompt_chatbot'].format_map({"human":text,"bot":""}) diff --git a/pythainlp/cli/__init__.py b/pythainlp/cli/__init__.py index 0d984d3c4..08a60efe3 100644 --- a/pythainlp/cli/__init__.py +++ b/pythainlp/cli/__init__.py @@ -18,7 +18,7 @@ from pythainlp.cli import data, soundex, tag, tokenize, benchmark -# a command should be a verb when possible +# a command should start with a verb when possible COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"]) CLI_NAME = "thainlp" @@ -27,7 +27,7 @@ def make_usage(command: str) -> dict: prog = f"{CLI_NAME} {command}" - return dict(prog=prog, usage=f"{prog} [options]") + return {"prog": prog, "usage": f"{prog} [options]"} def exit_if_empty(command: str, parser: ArgumentParser) -> None: diff --git a/pythainlp/cli/benchmark.py b/pythainlp/cli/benchmark.py index 807525dc2..5a8ed5136 100644 --- a/pythainlp/cli/benchmark.py +++ b/pythainlp/cli/benchmark.py @@ -108,7 +108,7 @@ def __init__(self, name, argv): "word_level:total_words_in_ref_sample", ] - statistics = dict() + statistics = {} for c in columns: statistics[c] = float(df_raw[c].sum()) @@ -170,9 +170,9 @@ def __init__(self, name, argv): del r["actual"] samples.append( - dict(metrics=r, expected=expected, actual=actual, id=i) + {"metrics": r, "expected": expected, "actual": actual, "id": i} ) - details = dict(metrics=statistics, samples=samples) + details = {"metrics": statistics, "samples": samples} json.dump(details, f, ensure_ascii=False) diff --git a/pythainlp/cli/data.py b/pythainlp/cli/data.py index 597c1860d..527038555 100644 --- a/pythainlp/cli/data.py +++ b/pythainlp/cli/data.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -thainlp dataset/corpus management command line. +Command line for PyThaiNLP's dataset/corpus management. """ import argparse -from pythainlp import cli, corpus +from pythainlp import corpus from pythainlp.tools import get_pythainlp_data_path @@ -123,5 +123,5 @@ def catalog(self, argv): ) def path(self, argv): - """Print path for local dataset.""" + """Print path of local dataset.""" print(get_pythainlp_data_path()) diff --git a/pythainlp/cli/soundex.py b/pythainlp/cli/soundex.py index 603cc50c4..605c1c7a6 100644 --- a/pythainlp/cli/soundex.py +++ b/pythainlp/cli/soundex.py @@ -13,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -thainlp soundex command line. +Command line for PyThaiNLP's soundex. -Take input text from command line. +It takes input text from the command line. """ import argparse -from pythainlp import cli from pythainlp.soundex import DEFAULT_SOUNDEX_ENGINE, soundex diff --git a/pythainlp/cli/tag.py b/pythainlp/cli/tag.py index 00cf3314f..a42b46855 100644 --- a/pythainlp/cli/tag.py +++ b/pythainlp/cli/tag.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -thainlp tag command line. +Command line for PyThaiNLP's taggers. 
""" import argparse from pythainlp import cli -from pythainlp.tag import locations, named_entity, pos_tag +from pythainlp.tag import pos_tag class SubAppBase: diff --git a/pythainlp/cli/tokenize.py b/pythainlp/cli/tokenize.py index b34218585..f6554b55e 100644 --- a/pythainlp/cli/tokenize.py +++ b/pythainlp/cli/tokenize.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -thainlp tokenize command line. +Command line for PyThaiNLP's tokenizers. """ import argparse @@ -22,7 +22,6 @@ from pythainlp.tokenize import ( DEFAULT_SENT_TOKENIZE_ENGINE, DEFAULT_SUBWORD_TOKENIZE_ENGINE, - DEFAULT_SYLLABLE_TOKENIZE_ENGINE, DEFAULT_WORD_TOKENIZE_ENGINE, sent_tokenize, subword_tokenize, diff --git a/pythainlp/cls/param_free.py b/pythainlp/cls/param_free.py index ed4db7f4e..8548b08d3 100644 --- a/pythainlp/cls/param_free.py +++ b/pythainlp/cls/param_free.py @@ -14,13 +14,13 @@ # limitations under the License. import gzip +from typing import List, Tuple import numpy as np -from typing import Dict, List, Tuple, Union class GzipModel: """ - This class is a reimplemenatation of “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors (Jiang et al., Findings 2023) + This class is a re-implementation of “Low-Resource” Text Classification: A Parameter-Free Classification Method with Compressors (Jiang et al., Findings 2023) :param list training_data: list [(text_sample,label)] """ @@ -30,7 +30,7 @@ def __init__(self, training_data: List[Tuple[str, str]]): self.Cx2_list = self.train() def train(self): - Cx2_list = list() + Cx2_list = [] for i in range(len(self.training_data)): Cx2_list.append( len(gzip.compress(self.training_data[i][0].encode("utf-8"))) @@ -39,7 +39,7 @@ def train(self): def predict(self, x1: str, k: int = 1) -> str: """ - :param str x1: the text that want to predict label. + :param str x1: the text that we want to predict label for. 
:param str k: k :return: label :rtype: str diff --git a/pythainlp/coref/_fastcoref.py b/pythainlp/coref/_fastcoref.py index e5ce90e23..c1d502a0d 100644 --- a/pythainlp/coref/_fastcoref.py +++ b/pythainlp/coref/_fastcoref.py @@ -17,15 +17,15 @@ class FastCoref: - def __init__(self, model_name, nlp=spacy.blank("th"), device:str="cpu", type:str="FCoref") -> None: + def __init__(self, model_name, nlp=spacy.blank("th"), device: str="cpu", type: str="FCoref") -> None: if type == "FCoref": from fastcoref import FCoref as _model else: from fastcoref import LingMessCoref as _model self.model_name = model_name self.nlp = nlp - self.model = _model(self.model_name,device=device,nlp=self.nlp) - + self.model = _model(self.model_name, device=device, nlp=self.nlp) + def _to_json(self, _predict): return { "text":_predict.text, @@ -33,6 +33,6 @@ def _to_json(self, _predict): "clusters":_predict.get_clusters(as_strings=False) } - - def predict(self, texts:List[str])->dict: + + def predict(self, texts: List[str]) -> dict: return [self._to_json(i) for i in self.model.predict(texts=texts)] diff --git a/pythainlp/coref/core.py b/pythainlp/coref/core.py index 77dda5a40..dd1e95d0f 100644 --- a/pythainlp/coref/core.py +++ b/pythainlp/coref/core.py @@ -20,10 +20,10 @@ def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", dev """ Coreference Resolution - :param List[str] texts: list texts to do coreference resolution + :param List[str] texts: list of texts to apply coreference resolution to :param str model_name: coreference resolution model - :param str device: device for running coreference resolution model (cpu, cuda, and other) - :return: List txets of coreference resolution + :param str device: device for running coreference resolution model on (cpu, cuda, and others) + :return: List of texts with coreference resolution :rtype: List[dict] :Options for model_name: @@ -49,7 +49,7 @@ def coreference_resolution(texts:List[str], model_name:str="han-coref-v1.0", dev global model if isinstance(texts, str): texts = [texts] - if model == None and model_name=="han-coref-v1.0": + if model is None and model_name=="han-coref-v1.0": from pythainlp.coref.han_coref import HanCoref model = HanCoref(device=device) - return model.predict(texts) \ No newline at end of file + return model.predict(texts) diff --git a/pythainlp/coref/han_coref.py b/pythainlp/coref/han_coref.py index 9ae062949..36f0bb642 100644 --- a/pythainlp/coref/han_coref.py +++ b/pythainlp/coref/han_coref.py @@ -12,13 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
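For orientation while reading this hunk, a minimal usage sketch of the public entry point these files implement (illustrative only; it assumes `coreference_resolution` is re-exported from `pythainlp.coref` and that the `han-coref-v1.0` weights can be fetched on first call):

```python
from pythainlp.coref import coreference_resolution

# A bare string is also accepted; core.py wraps it into a list before predicting.
print(coreference_resolution("หมูกรอบกินข้าวที่บ้าน มันอิ่มมาก", device="cpu"))
# -> a list of dicts with the keys assembled in _to_json above ("text", "clusters", ...)
```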
-from pythainlp.coref._fastcoref import FastCoref import spacy +from pythainlp.coref._fastcoref import FastCoref class HanCoref(FastCoref): def __init__(self,device:str="cpu",nlp=spacy.blank("th")) -> None: - super(self.__class__, self).__init__( + super().__init__( model_name="pythainlp/han-coref-v1.0", device=device, nlp=nlp diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py index 3ca56e0d0..e54e80c42 100644 --- a/pythainlp/corpus/__init__.py +++ b/pythainlp/corpus/__init__.py @@ -57,13 +57,13 @@ _CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME) _CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE") -# remote corpus catalog URL +# URL of remote corpus catalog _CORPUS_DB_URL = "https://pythainlp.github.io/pythainlp-corpus/db.json" -# local corpus catalog filename +# filename of local corpus catalog _CORPUS_DB_FILENAME = "db.json" -# local corpus catalog full path +# full path of local corpus catalog _CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME) # create a local corpus database if it does not already exist diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 411148358..2bf7777d5 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Common list of words. +Common lists of words. """ __all__ = [ @@ -39,7 +39,7 @@ _THAI_COUNTRIES_FILENAME = "countries_th.txt" _THAI_THAILAND_PROVINCES = set() -_THAI_THAILAND_PROVINCES_DETAILS = list() +_THAI_THAILAND_PROVINCES_DETAILS = [] _THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv" _THAI_SYLLABLES = set() @@ -75,7 +75,7 @@ def countries() -> FrozenSet[str]: \n(See: `dev/pythainlp/corpus/countries_th.txt\ `_) - :return: :class:`frozenset` containing countries names in Thai + :return: :class:`frozenset` containing country names in Thai :rtype: :class:`frozenset` """ global _THAI_COUNTRIES @@ -105,12 +105,12 @@ def provinces(details: bool = False) -> Union[FrozenSet[str], List[str]]: if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS: provs = set() - prov_details = list() + prov_details = [] for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True): p = line.split(",") - prov = dict() + prov = {} prov["name_th"] = p[0] prov["abbr_th"] = p[1] prov["name_en"] = p[2] @@ -134,9 +134,9 @@ def thai_syllables() -> FrozenSet[str]: "โมน", "โม่ง", "กา", "ก่า", and, "ก้า". \n(See: `dev/pythainlp/corpus/syllables_th.txt\ `_) - We using thai syllables list from `KUCut `_. + We use the Thai syllable list from `KUCut `_. - :return: :class:`frozenset` containing syllables in Thai language. + :return: :class:`frozenset` containing syllables in the Thai language. :rtype: :class:`frozenset` """ global _THAI_SYLLABLES @@ -152,7 +152,7 @@ def thai_words() -> FrozenSet[str]: and "พิษภัย". \n(See: `dev/pythainlp/corpus/words_th.txt\ `_) - :return: :class:`frozenset` containing words in Thai language. + :return: :class:`frozenset` containing words in the Thai language. :rtype: :class:`frozenset` """ global _THAI_WORDS @@ -168,7 +168,7 @@ def thai_orst_words() -> FrozenSet[str]: \n(See: `dev/pythainlp/corpus/thai_orst_words.txt\ `_) - :return: :class:`frozenset` containing words in Thai language. + :return: :class:`frozenset` containing words in the Thai language. 
:rtype: :class:`frozenset` """ global _THAI_ORST_WORDS @@ -183,7 +183,7 @@ def thai_stopwords() -> FrozenSet[str]: Return a frozenset of Thai stopwords such as "มี", "ไป", "ไง", "ขณะ", "การ", and "ประการหนึ่ง". \n(See: `dev/pythainlp/corpus/stopwords_th.txt\ `_) - We using stopword lists by thesis's เพ็ญศิริ ลี้ตระกูล. + We use stopword lists by thesis's เพ็ญศิริ ลี้ตระกูล. :See Also: @@ -207,7 +207,7 @@ def thai_negations() -> FrozenSet[str]: \n(See: `dev/pythainlp/corpus/negations_th.txt\ `_) - :return: :class:`frozenset` containing negations in Thai language. + :return: :class:`frozenset` containing negations in the Thai language. :rtype: :class:`frozenset` """ global _THAI_NEGATIONS @@ -271,11 +271,11 @@ def thai_dict() -> dict: \n(See: `thai_dict\ `_) - :return: Thai word with part-of-speech type and definition + :return: Thai words with part-of-speech type and definition :rtype: dict """ global _THAI_DICT - if _THAI_DICT == {}: + if not _THAI_DICT: import csv _THAI_DICT = {"word":[], "meaning":[]} with open(get_corpus_path("thai_dict"), newline="\n", encoding="utf-8") as csvfile: @@ -293,11 +293,11 @@ def thai_wsd_dict() -> dict: \n(See: `thai_dict\ `_) - :return: Thai word with part-of-speech type and definition + :return: Thai words with part-of-speech type and definition :rtype: dict """ global _THAI_WSD_DICT - if _THAI_WSD_DICT == {}: + if not _THAI_WSD_DICT: _thai_wsd = thai_dict() _THAI_WSD_DICT = {"word":[],"meaning":[]} for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]): @@ -319,11 +319,11 @@ def thai_synonym() -> dict: \n(See: `thai_synonym\ `_) - :return: Thai word with part-of-speech type and synonym + :return: Thai words with part-of-speech type and synonym :rtype: dict """ global _THAI_SYNONYM - if _THAI_SYNONYM == None: + if _THAI_SYNONYM is None: import csv _THAI_SYNONYM = {"word":[], "pos":[], "synonym":[]} with open(get_corpus_path("thai_synonym"), newline="\n", encoding="utf-8") as csvfile: diff --git a/pythainlp/corpus/conceptnet.py b/pythainlp/corpus/conceptnet.py index 943a9bf43..979bb9683 100644 --- a/pythainlp/corpus/conceptnet.py +++ b/pythainlp/corpus/conceptnet.py @@ -25,8 +25,8 @@ def edges(word: str, lang: str = "th"): understand the meanings of words that people use. For example, the term "ConceptNet" is a "knowledge graph", and - "knowledge graph" has "common sense knowledge" which is a part of - "artificial inteligence". Also, "ConcepNet" is used for + "knowledge graph" has "common sense knowledge" which is a part of + "artificial intelligence". Also, "ConcepNet" is used for "natural language understanding" which is a part of "artificial intelligence". diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 148728ec7..d7479f48f 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -50,8 +50,8 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict: """ Get details about a corpus, using information from local catalog. 
- :param str name: name corpus - :return: details about a corpus + :param str name: name of corpus + :return: details about corpus :rtype: dict """ with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: @@ -66,7 +66,7 @@ def get_corpus_db_detail(name: str, version: str = None) -> dict: if corpus["name"] == name and corpus["version"] == version: return corpus - return dict() + return {} def path_pythainlp_corpus(filename: str) -> str: @@ -87,7 +87,7 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: Each line in the file will be a member of the set or the list. - By default, a frozenset will be return, with whitespaces stripped, and + By default, a frozenset will be return, with whitespace stripped and empty values and duplicates removed. If as_is is True, a list will be return, with no modifications @@ -96,7 +96,7 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: :param str filename: filename of the corpus to be read - :return: :class:`frozenset` or :class:`list` consists of lines in the file + :return: :class:`frozenset` or :class:`list` consisting of lines in the file :rtype: :class:`frozenset` or :class:`list` :Example: @@ -132,12 +132,12 @@ def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]: Get model path from default_db.json :param str name: corpus name - :return: path to the corpus or **None** of the corpus doesn't \ - exist in the device + :return: path to the corpus or **None** if the corpus doesn't \ + exist on the device :rtype: str - If you want edit default_db.json, \ - you can edit in pythainlp/corpus/default_db.json + If you want to edit default_db.json, \ + you can edit pythainlp/corpus/default_db.json """ default_db_path = path_pythainlp_corpus("default_db.json") with open(default_db_path, encoding="utf-8-sig") as fh: @@ -163,14 +163,14 @@ def get_corpus_path( :param str name: corpus name :param str version: version - :param bool force: force download - :return: path to the corpus or **None** of the corpus doesn't \ - exist in the device + :param bool force: force downloading + :return: path to the corpus or **None** if the corpus doesn't \ + exist on the device :rtype: str :Example: - (Please see the filename from + (Please see the filename in `this file `_ @@ -198,18 +198,18 @@ def get_corpus_path( print(get_corpus_path('wiki_lm_lstm')) # output: /root/pythainlp-data/thwiki_model_lstm.pth """ - # Customize your the corpus path then close the line after lines 164 through 190. + # Customize your corpus path then close the line from lines 164 through 190. 
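    # Illustrative only (hypothetical corpus name and path, not part of this patch):
    # pinning a corpus to a local file would look like
    #     _CUSTOMIZE = {"thai2fit_wv": "/home/user/models/thai2fit.vec"}
    # after which get_corpus_path("thai2fit_wv") returns that path directly.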
_CUSTOMIZE = { # "the corpus name":"path" } - if name in list(_CUSTOMIZE.keys()): + if name in list(_CUSTOMIZE): return _CUSTOMIZE[name] default_path = get_corpus_default_db(name=name, version=version) if default_path is not None: return default_path - # check if the corpus is in local catalog, download if not + # check if the corpus is in local catalog, download it if not corpus_db_detail = get_corpus_db_detail(name, version=version) if not corpus_db_detail or not corpus_db_detail.get("filename"): @@ -222,7 +222,7 @@ def get_corpus_path( path = get_full_data_path(corpus_db_detail.get("foldername")) else: path = get_full_data_path(corpus_db_detail.get("filename")) - # check if the corpus file actually exists, download if not + # check if the corpus file actually exists, download it if not if not os.path.exists(path): download(name, version=version, force=force) if os.path.exists(path): @@ -235,8 +235,8 @@ def _download(url: str, dst: str) -> int: """ Download helper. - @param: url to download file - @param: dst place to put the file + @param: URL for downloading file + @param: dst place to put the file into """ _CHUNK_SIZE = 64 * 1024 # 64 KiB @@ -270,8 +270,8 @@ def _check_hash(dst: str, md5: str) -> None: """ Check hash helper. - @param: dst place to put the file - @param: md5 place to hash the file (MD5) + @param: dst place to put the file into + @param: md5 place to file hash (MD5) """ if md5 and md5 != "-": import hashlib @@ -312,9 +312,9 @@ def _check_version(cause: str) -> bool: check = False __version = __version__ if "dev" in __version: - __version = __version.split("dev")[0] + __version = __version.split("dev", maxsplit=1)[0] elif "beta" in __version: - __version = __version.split("beta")[0] + __version = __version.split("beta", maxsplit=1)[0] v = _version2int(__version) if cause == "*": @@ -330,13 +330,13 @@ def _check_version(cause: str) -> bool: check = v > _version2int(temp) elif cause.startswith(">=") and "<=" not in cause and "<" in cause: temp = cause.replace(">=", "").split("<") - check = v >= _version2int(temp[0]) and v < _version2int(temp[1]) + check = _version2int(temp[0]) <= v < _version2int(temp[1]) elif cause.startswith(">=") and "<=" in cause: temp = cause.replace(">=", "").split("<=") - check = v >= _version2int(temp[0]) and v <= _version2int(temp[1]) + check = _version2int(temp[0]) <= v <= _version2int(temp[1]) elif cause.startswith(">") and "<" in cause: temp = cause.replace(">", "").split("<") - check = v > _version2int(temp[0]) and v < _version2int(temp[1]) + check = _version2int(temp[0]) < v < _version2int(temp[1]) elif cause.startswith("<="): temp = cause.replace("<=", "") check = v <= _version2int(temp[0]) @@ -357,10 +357,10 @@ def download( https://pythainlp.github.io/pythainlp-corpus/db.json :param str name: corpus name - :param bool force: force download + :param bool force: force downloading :param str url: URL of the corpus catalog - :param str version: Version of the corpus - :return: **True** if the corpus is found and succesfully downloaded. + :param str version: version of the corpus + :return: **True** if the corpus is found and successfully downloaded. Otherwise, it returns **False**. :rtype: bool @@ -375,7 +375,7 @@ def download( # - Downloading: wiki_lm_lstm 0.1 # thwiki_lm.pth: 26%|██▌ | 114k/434k [00:00<00:00, 690kB/s] - By default, downloaded corpus and model will be saved in + By default, downloaded corpora and models will be saved in ``$HOME/pythainlp-data/`` (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``). 
""" @@ -424,7 +424,7 @@ def download( found = i break - # If not found in local, download + # If not found in local, download it if force or not found: print(f"- Downloading: {name} {version}") _download( @@ -484,16 +484,16 @@ def download( with open(corpus_db_path(), "w", encoding="utf-8") as f: json.dump(local_db, f, ensure_ascii=False) - # Check if versions match if the corpus is found in local database + # Check if versions match or if the corpus is found in local database # but a re-download is not forced else: current_ver = local_db["_default"][found]["version"] if current_ver == version: - # Already has the same version + # Corpus of the same version already exists print("- Already up to date.") else: - # Has the corpus but different version + # Corpus exists but is of different version print(f"- Existing version: {current_ver}") print(f"- New version available: {version}") print("- Use download(data_name, force=True) to update") @@ -509,7 +509,7 @@ def remove(name: str) -> bool: Remove corpus :param str name: corpus name - :return: **True** if the corpus is found and succesfully removed. + :return: **True** if the corpus is found and successfully removed. Otherwise, it returns **False**. :rtype: bool diff --git a/pythainlp/corpus/corpus_license.md b/pythainlp/corpus/corpus_license.md index b5a28cdc1..5137bb9a2 100644 --- a/pythainlp/corpus/corpus_license.md +++ b/pythainlp/corpus/corpus_license.md @@ -6,7 +6,7 @@ ## Dictionaries and Word Lists -The following word lists are created by PyThaiNLP project and released under +The following word lists are created by the PyThaiNLP project and released under **Creative Commons Zero 1.0 Universal Public Domain Dedication License** https://creativecommons.org/publicdomain/zero/1.0/ @@ -25,7 +25,7 @@ words_th_thai2fit_201810.txt | List of Thai words (frozen for thai2fit) The following word lists are from **Thai Male and Female Names Corpus** https://github.com/korkeatw/thai-names-corpus/ by Korkeat Wannapat -and released under their the original license which is +and released under their original licenses which are **Creative Commons Attribution-ShareAlike 4.0 International Public License** https://creativecommons.org/licenses/by-sa/4.0/ @@ -38,7 +38,7 @@ person_names_male_th.txt | List of male names in Thailand ## Models -The following language models are created by PyThaiNLP project +The following language models are created by the PyThaiNLP project and released under **Creative Commons Attribution 4.0 International Public License** https://creativecommons.org/licenses/by/4.0/ diff --git a/pythainlp/corpus/oscar.py b/pythainlp/corpus/oscar.py index ac4d85962..5e308ebd2 100644 --- a/pythainlp/corpus/oscar.py +++ b/pythainlp/corpus/oscar.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" -Thai unigram word frequency from OSCAR Corpus (icu word tokenize) +Thai unigram word frequency from OSCAR Corpus (words tokenized using ICU) Credit: Korakot Chaovavanich https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/ @@ -31,12 +31,12 @@ def word_freqs() -> List[Tuple[str, int]]: """ - Get word frequency from OSCAR Corpus (icu word tokenize) + Get word frequency from OSCAR Corpus (words tokenized using ICU) """ word_freqs = [] _path = get_corpus_path(_FILENAME) with open(_path, "r", encoding="utf-8-sig") as f: - _data = [i for i in f.readlines()] + _data = list(f.readlines()) del _data[0] for line in _data: _temp = line.strip().split(",") @@ -51,12 +51,12 @@ def word_freqs() -> List[Tuple[str, int]]: def unigram_word_freqs() -> defaultdict: """ - Get unigram word frequency from OSCAR Corpus (icu word tokenize) + Get unigram word frequency from OSCAR Corpus (words tokenized using ICU) """ _path = get_corpus_path(_FILENAME) _word_freqs = defaultdict(int) with open(_path, "r", encoding="utf-8-sig") as fh: - _data = [i for i in fh.readlines()] + _data = list(fh.readlines()) del _data[0] for i in _data: _temp = i.strip().split(",") diff --git a/pythainlp/corpus/th_en_translit.py b/pythainlp/corpus/th_en_translit.py index 7ce306794..1dfd66b4f 100644 --- a/pythainlp/corpus/th_en_translit.py +++ b/pythainlp/corpus/th_en_translit.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Thai-English Transliteratation Dictionary v1.4 +Thai-English Transliteration Dictionary v1.4 Wannaphong Phatthiyaphaibun. (2022). wannaphong/thai-english-transliteration-dictionary: v1.4 (v1.4). @@ -37,9 +37,9 @@ def get_transliteration_dict() -> defaultdict: """ - Get transliteration dictionary for Thai to English. + Get Thai to English transliteration dictionary. - The returned dict is defaultdict[str, defaultdict[List[str], List[Optional[bool]]]] format. + The returned dict is in defaultdict[str, defaultdict[List[str], List[Optional[bool]]]] format. """ path = path_pythainlp_corpus(_FILE_NAME) if not path: @@ -48,18 +48,18 @@ def get_transliteration_dict() -> defaultdict: f"{_FILE_NAME} is not found under pythainlp/corpus." ) - # use list, one word can have multiple transliterations. + # use list, as one word can have multiple transliterations. trans_dict = defaultdict( lambda: {TRANSLITERATE_EN: [], TRANSLITERATE_FOLLOW_RTSG: []} ) try: with open(path, "r", encoding="utf-8") as f: - # assume first row contains column names, skipped. + # assume that the first row contains column names, so skip it. for line in f.readlines()[1:]: stripped = line.strip() if stripped: th, *en_checked = stripped.split("\t") - # replace in-between whitespaces to prevent mismatch results from different tokenizers. + # replace in-between whitespace to prevent mismatched results from different tokenizers. # e.g. "บอยแบนด์" # route 1: "บอยแบนด์" -> ["บอย", "แบนด์"] -> ["boy", "band"] -> "boyband" # route 2: "บอยแบนด์" -> [""บอยแบนด์""] -> ["boy band"] -> "boy band" diff --git a/pythainlp/corpus/util.py b/pythainlp/corpus/util.py index 75a917db5..f5190010b 100644 --- a/pythainlp/corpus/util.py +++ b/pythainlp/corpus/util.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Tool for create word list -code is from Korakot Chaovavanich. +Tool for creating word lists +codes are from Korakot Chaovavanich. 
:See also: * `Facebook post \ @@ -33,7 +33,7 @@ def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]: """ - Return begining and ending index pairs of words + Return beginning and ending indexes of word pairs """ i = 0 for w in words: @@ -52,7 +52,7 @@ def find_badwords( :param Callable[[str], List[str]] tokenize: a tokenize function :param Iterable[Iterable[str]] training_data: tokenized text, to be used\ as a training set - :return: words that considered making `tokenize` perform unwell + :return: words that are considered to make `tokenize` perform badly :rtype: Set[str] """ right = Counter() @@ -68,7 +68,7 @@ def find_badwords( else: wrong[w] += 1 - # if wrong more than right, then it's a bad word + # if wrong is more than right, then it's a bad word bad_words = [] for w, count in wrong.items(): if count > right[w]: @@ -83,10 +83,10 @@ def revise_wordset( training_data: Iterable[Iterable[str]], ) -> Set[str]: """ - Revise a set of word that could improve tokenization performance of + Revise a set of words that could improve tokenization performance of a dictionary-based `tokenize` function. - `orign_words` will be used as a base set for the dictionary. + `orig_words` will be used as a base set for the dictionary. Words that do not performed well with `training_data` will be removed. The remaining words will be returned. @@ -96,7 +96,7 @@ def revise_wordset( will be used as a base for revision :param Iterable[Iterable[str]] training_data: tokenized text, to be used\ as a training set - :return: words that considered making `tokenize` perform unwell + :return: words that are considered to make `tokenize` perform badly :rtype: Set[str] :Example:: @@ -141,7 +141,7 @@ def revise_newmm_default_wordset( :param Iterable[Iterable[str]] training_data: tokenized text, to be used\ as a training set - :return: words that considered making `tokenize` perform unwell + :return: words that are considered to make `tokenize` perform badly :rtype: Set[str] """ orig_words = thai_words() diff --git a/pythainlp/corpus/wordnet.py b/pythainlp/corpus/wordnet.py index 07a470d0e..4fe6a0e32 100644 --- a/pythainlp/corpus/wordnet.py +++ b/pythainlp/corpus/wordnet.py @@ -16,7 +16,7 @@ NLTK WordNet wrapper API here is exactly the same as NLTK WordNet API, -except that lang (language) argument will be "tha" (Thai) by default. +except that the lang (language) argument is "tha" (Thai) by default. For more on usage, see NLTK Howto: https://www.nltk.org/howto/wordnet.html @@ -38,17 +38,17 @@ def synsets(word: str, pos: str = None, lang: str = "tha"): """ - This function return the synonym sets for all lemmas given the word + This function returns the synonym set for all lemmas of the given word with an optional argument to constrain the part of speech of the word. - :param str word: word to find its synsets - :param str pos: the part of speech constraint (i.e. *n* for Noun, *v* + :param str word: word to find synsets of + :param str pos: constraint of the part of speech (i.e. *n* for Noun, *v* for Verb, *a* for Adjective, *s* for Adjective satellites, and *r* for Adverb) :param str lang: abbreviation of language (i.e. *eng*, *tha*). By default, it is *tha* - :return: :class:`Synset` for all lemmas for the word constrained with + :return: :class:`Synset` all lemmas of the word constrained with the argument *pos*. 
:rtype: list[:class:`Synset`] @@ -66,11 +66,11 @@ def synsets(word: str, pos: str = None, lang: str = "tha"): Synset('base.n.14'), Synset('home.n.01'), Synset('houseful.n.01'), Synset('home.n.07')] - When specifying the part of speech constrain. For example, - the word "แรง" cound be interpreted as force (n.) or hard (adj.). + When specifying the constraint of the part of speech. For example, + the word "แรง" could be interpreted as force (n.) or hard (adj.). >>> from pythainlp.corpus.wordnet import synsets - >>> # By default, accept all part of speech + >>> # By default, allow all parts of speech >>> synsets("แรง", lang="tha") >>> >>> # only Noun @@ -86,10 +86,10 @@ def synsets(word: str, pos: str = None, lang: str = "tha"): def synset(name_synsets): """ - This function return the synonym set (synset) given the name of synset + This function returns the synonym set (synset) given the name of the synset (i.e. 'dog.n.01', 'chase.v.01'). - :param str name_synsets: name of the sysset + :param str name_synsets: name of the synset :return: :class:`Synset` of the given name :rtype: :class:`Synset` @@ -111,18 +111,18 @@ def synset(name_synsets): def all_lemma_names(pos: str = None, lang: str = "tha"): """ - This function returns all lemma names for all synsets for the given + This function returns all lemma names for all synsets of the given part of speech tag and language. If part of speech tag is not - specified, all synsets for all part of speech will be used. + specified, all synsets of all parts of speech will be used. - :param str pos: the part of speech constraint (i.e. *n* for Noun, + :param str pos: constraint of the part of speech (i.e. *n* for Noun, *v* for Verb, *a* for Adjective, *s* for Adjective satellites, and *r* for Adverb). By default, *pos* is **None**. :param str lang: abbreviation of language (i.e. *eng*, *tha*). By default, it is *tha*. - :return: :class:`Synset` of lemmas names given the pos and language + :return: :class:`Synset` of lemmas names given the POS and language :rtype: list[:class:`Synset`] :Example: @@ -154,12 +154,12 @@ def all_lemma_names(pos: str = None, lang: str = "tha"): def all_synsets(pos: str = None): """ - This function iterates over all synsets constrained by given + This function iterates over all synsets constrained by the given part of speech tag. :param str pos: part of speech tag - :return: list of synsets constrained by given part of speech tag. + :return: list of synsets constrained by the given part of speech tag. :rtype: Iterable[:class:`Synset`] :Example: @@ -185,7 +185,7 @@ def all_synsets(pos: str = None): def langs(): """ - This function return a set of ISO-639 language codes. + This function returns a set of ISO-639 language codes. :return: ISO-639 language codes :rtype: list[str] @@ -207,15 +207,15 @@ def lemmas(word: str, pos: str = None, lang: str = "tha"): This function returns all lemmas given the word with an optional argument to constrain the part of speech of the word. - :param str word: word to find its lammas - :param str pos: the part of speech constraint (i.e. *n* for Noun, + :param str word: word to find lemmas of + :param str pos: constraint of the part of speech (i.e. *n* for Noun, *v* for Verb, *a* for Adjective, *s* for Adjective satellites, and *r* for Adverb) :param str lang: abbreviation of language (i.e. *eng*, *tha*). By default, it is *tha*. - :return: :class:`Synset` for all lemmas for the word constraine - with the argument *pos*. 
+ :return: :class:`Synset` of all lemmas of the word constrained + by the argument *pos*. :rtype: list[:class:`Lemma`] :Example: @@ -229,7 +229,7 @@ def lemmas(word: str, pos: str = None, lang: str = "tha"): [Lemma('god.n.01.พระเจ้า'), Lemma('godhead.n.01.พระเจ้า'), Lemma('father.n.06.พระเจ้า'), Lemma('god.n.03.พระเจ้า')] - When specify the part of speech tag. + When the part of speech tag is specified: >>> from pythainlp.corpus.wordnet import lemmas >>> @@ -239,7 +239,7 @@ def lemmas(word: str, pos: str = None, lang: str = "tha"): Lemma('roll_up.v.01.ม้วน'), Lemma('wind.v.03.ม้วน'), Lemma('roll.n.11.ม้วน')] >>> - >>> # only lammas with Noun as the part of speech + >>> # only lemmas with Noun as the part of speech >>> lemmas("ม้วน", pos="n") [Lemma('roll.n.11.ม้วน')] """ @@ -248,7 +248,7 @@ def lemmas(word: str, pos: str = None, lang: str = "tha"): def lemma(name_synsets): """ - This function return lemma object given the name. + This function returns lemma object given the name. .. note:: Support only English language (*eng*). @@ -277,8 +277,8 @@ def lemma(name_synsets): def lemma_from_key(key): """ This function returns lemma object given the lemma key. - This is similar to :func:`lemma` but it needs to supply the key - of lemma instead of the name. + This is similar to :func:`lemma` but it needs to be given the key + of lemma instead of the name of lemma. .. note:: Support only English language (*eng*). @@ -304,7 +304,7 @@ def lemma_from_key(key): def path_similarity(synsets1, synsets2): """ This function returns similarity between two synsets based on the - shortest path distance from the equation as follows. + shortest path distance calculated using the equation below. .. math:: @@ -312,13 +312,13 @@ def path_similarity(synsets1, synsets2): synsets2) + 1} The shortest path distance is calculated by the connection through - the is-a (hypernym/hyponym) taxonomy. The score is in the ranage + the is-a (hypernym/hyponym) taxonomy. The score is in the range of 0 to 1. Path similarity of 1 indicates identicality. :param `Synset` synsets1: first synset supplied to measures - the path similarity + the path similarity with :param `Synset` synsets2: second synset supplied to measures - the path similarity + the path similarity with :return: path similarity between two synsets :rtype: float @@ -386,9 +386,9 @@ def wup_similarity(synsets1, synsets2): Least Common Subsumer (most specific ancestor node). :param `Synset` synsets1: first synset supplied to measures - the WUP similarity + the WUP similarity with :param `Synset` synsets2: second synset supplied to measures - the WUP similarity + the WUP similarity with :return: WUP similarity between two synsets :rtype: float @@ -416,7 +416,7 @@ def morphy(form, pos: str = None): This function finds a possible base form for the given form, with the given part of speech. 
- :param str form: the form to finds the base form + :param str form: the form to finds the base form of :param str pos: part of speech tag of words to be searched :return: base form of the given form diff --git a/pythainlp/el/_multiel.py b/pythainlp/el/_multiel.py index 216f9f6b9..c48753dd5 100644 --- a/pythainlp/el/_multiel.py +++ b/pythainlp/el/_multiel.py @@ -30,4 +30,4 @@ def load_model(self): def process_batch(self, list_text): if isinstance(list_text, str): list_text = [list_text] - return self._bela_run.process_batch(list_text) \ No newline at end of file + return self._bela_run.process_batch(list_text) diff --git a/pythainlp/el/core.py b/pythainlp/el/core.py index 6accbff63..d2f0a5acf 100644 --- a/pythainlp/el/core.py +++ b/pythainlp/el/core.py @@ -21,7 +21,7 @@ def __init__(self, model_name:str="bela", device:str="cuda", tag:str="wikidata") EntityLinker :param str model_name: model name (bela) - :param str device: device for running model + :param str device: device for running model on :param str tag: Entity linking tag (wikidata) You can read about bela model at `https://github.com/PyThaiNLP/MultiEL \ @@ -40,7 +40,7 @@ def get_el(self, list_text:Union[List[str], str])->Union[List[dict], str]: """ Get Entity Linking from Thai Text - :param str Union[List[str], str]: list thai text or text + :param str Union[List[str], str]: list of Thai text or text :return: list of entity linking :rtype: Union[List[dict], str] diff --git a/pythainlp/generate/__init__.py b/pythainlp/generate/__init__.py index 55e95f3f9..851391733 100644 --- a/pythainlp/generate/__init__.py +++ b/pythainlp/generate/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Thai Text generate +Thai Text Generation """ __all__ = ["Unigram", "Bigram", "Trigram"] diff --git a/pythainlp/generate/core.py b/pythainlp/generate/core.py index 49331b21b..d0aec3988 100644 --- a/pythainlp/generate/core.py +++ b/pythainlp/generate/core.py @@ -15,10 +15,11 @@ """ Text generator using n-gram language model -code from +codes are from https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058 """ import random +from typing import List, Union from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram @@ -26,7 +27,6 @@ from pythainlp.corpus.oscar import ( unigram_word_freqs as oscar_word_freqs_unigram, ) -from typing import List, Union class Unigram: @@ -62,12 +62,12 @@ def gen_sentence( duplicate: bool = False, ) -> Union[List[str], str]: """ - :param str start_seq: word for begin word. - :param int N: number of word. 
- :param bool output_str: output is str - :param bool duplicate: duplicate word in sent + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence - :return: list words or str words + :return: list of words or a word string :rtype: List[str], str :Example: @@ -78,7 +78,7 @@ def gen_sentence( gen = Unigram() gen.gen_sentence("แมว") - # ouput: 'แมวเวลานะนั้น' + # output: 'แมวเวลานะนั้น' """ if start_seq is None: start_seq = random.choice(self.word) @@ -105,7 +105,7 @@ def _next_word( self._word_list = list(self._word_prob.keys()) if N > len(self._word_list): N = len(self._word_list) - for i in range(N): + for _ in range(N): self._word = random.choice(self._word_list) if duplicate is False: while self._word in self.words: @@ -135,7 +135,7 @@ def __init__(self, name: str = "tnc"): def prob(self, t1: str, t2: str) -> float: """ - probability word + probability of word :param int t1: text 1 :param int t2: text 2 @@ -158,12 +158,12 @@ def gen_sentence( duplicate: bool = False, ) -> Union[List[str], str]: """ - :param str start_seq: word for begin word. - :param int N: number of word. - :param bool output_str: output is str - :param bool duplicate: duplicate word in sent + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence - :return: list words or str words + :return: list of words or a word string :rtype: List[str], str :Example: @@ -174,7 +174,7 @@ def gen_sentence( gen = Bigram() gen.gen_sentence("แมว") - # ouput: 'แมวไม่ได้รับเชื้อมัน' + # output: 'แมวไม่ได้รับเชื้อมัน' """ if start_seq is None: start_seq = random.choice(self.words) @@ -182,7 +182,7 @@ def gen_sentence( self.list_word = [] self.list_word.append(start_seq) - for i in range(N): + for _ in range(N): if duplicate: self._temp = [ j for j in self.bi_keys if j[0] == self.late_word @@ -228,7 +228,7 @@ def __init__(self, name: str = "tnc"): def prob(self, t1: str, t2: str, t3: str) -> float: """ - probability word + probability of word :param int t1: text 1 :param int t2: text 2 @@ -253,12 +253,12 @@ def gen_sentence( duplicate: bool = False, ) -> Union[List[str], str]: """ - :param str start_seq: word for begin word. - :param int N: number of word. 
- :param bool output_str: output is str - :param bool duplicate: duplicate word in sent + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence - :return: list words or str words + :return: list of words or a word string :rtype: List[str], str :Example: @@ -269,7 +269,7 @@ def gen_sentence( gen = Trigram() gen.gen_sentence() - # ouput: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ' + # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ' """ if start_seq is None: start_seq = random.choice(self.bi_keys) diff --git a/pythainlp/generate/thai2fit.py b/pythainlp/generate/thai2fit.py index 0b65dbe79..863e3330d 100644 --- a/pythainlp/generate/thai2fit.py +++ b/pythainlp/generate/thai2fit.py @@ -15,15 +15,15 @@ """ Thai2fit: Thai Wikipeida Language Model for Text Generation -Code from +Codes are from https://github.com/PyThaiNLP/tutorials/blob/master/source/notebooks/text_generation.ipynb """ __all__ = ["gen_sentence"] -import pandas as pd import random import pickle from typing import List, Union +import pandas as pd # fastai import fastai @@ -63,21 +63,21 @@ data_lm.sanity_check() -config = dict( - emb_sz=400, - n_hid=1550, - n_layers=4, - pad_token=1, - qrnn=False, - tie_weights=True, - out_bias=True, - output_p=0.25, - hidden_p=0.1, - input_p=0.2, - embed_p=0.02, - weight_p=0.15, -) -trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1) +config = { + "emb_sz": 400, + "n_hid": 1550, + "n_layers": 4, + "pad_token": 1, + "qrnn": False, + "tie_weights": True, + "out_bias": True, + "output_p": 0.25, + "hidden_p": 0.1, + "input_p": 0.2, + "embed_p": 0.02, + "weight_p": 0.15, +} +trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1} learn = language_model_learner( data_lm, AWD_LSTM, config=config, pretrained=False, **trn_args @@ -96,10 +96,10 @@ def gen_sentence( """ Text generator using Thai2fit - :param str start_seq: word for begin word. - :param int N: number of word. 
- :param bool output_str: output is str - :param bool duplicate: duplicate word in sent + :param str start_seq: word to begin sentence with + :param int N: number of words + :param bool output_str: output as string + :param bool duplicate: allow duplicate words in sentence :return: list words or str words :rtype: List[str], str diff --git a/pythainlp/generate/wangchanglm.py b/pythainlp/generate/wangchanglm.py index 851e5e4fa..be06e5552 100644 --- a/pythainlp/generate/wangchanglm.py +++ b/pythainlp/generate/wangchanglm.py @@ -46,8 +46,8 @@ def load_model( """ Load model - :param str model_path: Model path - :param bool return_dict: return_dict + :param str model_path: model path + :param bool return_dict: return dict :param bool load_in_8bit: load model in 8bit :param str device: device (cpu, cuda or other) :param torch_dtype torch_dtype: torch_dtype @@ -71,7 +71,7 @@ def load_model( self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) self.df = pd.DataFrame(self.tokenizer.vocab.items(), columns=['text', 'idx']) self.df['is_exclude'] = self.df.text.map(self.is_exclude) - self.exclude_ids = self.df[self.df.is_exclude==True].idx.tolist() + self.exclude_ids = self.df[self.df.is_exclude is True].idx.tolist() def gen_instruct( self, text:str, @@ -88,15 +88,15 @@ def gen_instruct( Generate Instruct :param str text: text - :param int max_new_tokens: max new tokens - :param float top_p: Top p + :param int max_new_tokens: maximum number of new tokens + :param float top_p: top p :param float temperature: temperature - :param int top_k: Top k - :param int no_repeat_ngram_size: no repeat ngram size + :param int top_k: top k + :param int no_repeat_ngram_size: do not repeat ngram size :param float typical_p: typical p :param bool thai_only: Thai only :param bool skip_special_tokens: skip special tokens - :return: the answer from Instruct. + :return: the answer from Instruct :rtype: str """ batch = self.tokenizer(text, return_tensors="pt") @@ -143,15 +143,15 @@ def instruct_generate( :param str instruct: Instruct :param str context: context - :param int max_new_tokens: max new tokens - :param float top_p: Top p + :param int max_new_tokens: maximum number of new tokens + :param float top_p: top p :param float temperature: temperature - :param int top_k: Top k - :param int no_repeat_ngram_size: no repeat ngram size + :param int top_k: top k + :param int no_repeat_ngram_size: do not repeat ngram size :param float typical_p: typical p :param bool thai_only: Thai only :param bool skip_special_tokens: skip special tokens - :return: the answer from Instruct. + :return: the answer from Instruct :rtype: str :Example: @@ -174,7 +174,7 @@ def instruct_generate( # และเครื่องดื่มแอลกอฮอล์ """ - if context == None or context=="": + if context in (None, ""): prompt = self.PROMPT_DICT['prompt_no_input'].format_map( {'instruction': instruct, 'input': ''} ) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 536e1f1c4..26146d102 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -21,14 +21,13 @@ def __init__(self): """ KhaveeVerifier: Thai Poetry verifier """ - pass def check_sara(self, word: str)-> str: """ Check the vowels in the Thai word. :param str word: Thai word - :return: name vowel of the word. 
+ :return: vowel name of the word :rtype: str :Example: @@ -43,12 +42,12 @@ def check_sara(self, word: str)-> str: """ sara = [] countoa = 0 - # In case การันย์ + # In case of การันย์ if '์' in word[-1]: word = word[:-2] - # In case สระเดี่ยว + # In case of สระเดี่ยว for i in word: - if i == 'ะ' or i == 'ั': + if i in ('ะ', 'ั'): sara.append('อะ') elif i == 'ิ': sara.append('อิ') @@ -67,7 +66,7 @@ def check_sara(self, word: str)-> str: elif i == 'แ': sara.append('แอ') elif i == 'า': - sara.append('อา') + sara.append('อา') elif i == 'โ': sara.append('โอ') elif i == 'ำ': @@ -77,8 +76,8 @@ def check_sara(self, word: str)-> str: sara.append('ออ') elif i == 'ั' and 'ว' in word: sara.append('อัว') - elif i == 'ไ' or i == 'ใ': - sara.append('ไอ') + elif i in ('ไ', 'ใ'): + sara.append('ไอ') elif i == '็': sara.append('ออ') elif 'รร' in word: @@ -86,10 +85,10 @@ def check_sara(self, word: str)-> str: sara.append('อำ') else: sara.append('อะ') - # Incase ออ + # In case of ออ if countoa == 1 and 'อ' in word[-1] and 'เ' not in word: sara.remove('ออ') - # In case เอ เอ + # In case of เอ เอ countA = 0 for i in sara: if i == 'เอ': @@ -98,7 +97,7 @@ def check_sara(self, word: str)-> str: sara.remove('เอ') sara.remove('เอ') sara.append('แ') - # In case สระประสม + # In case of สระประสม if 'เอ' in sara and 'อะ' in sara: sara.remove('เอ') sara.remove('อะ') @@ -114,42 +113,42 @@ def check_sara(self, word: str)-> str: elif 'เอ' in sara and 'อิ' in sara: sara.remove('เอ') sara.remove('อิ') - sara.append('เออ') + sara.append('เออ') elif 'เอ' in sara and 'ออ' in sara and 'อ' in word[-1]: sara.remove('เอ') sara.remove('ออ') sara.append('เออ') - elif 'โอ' in sara and 'อะ' in sara: + elif 'โอ' in sara and 'อะ' in sara: sara.remove('โอ') sara.remove('อะ') sara.append('โอะ') - elif 'เอ' in sara and 'อี' in sara: + elif 'เอ' in sara and 'อี' in sara: sara.remove('เอ') sara.remove('อี') sara.append('เอีย') - elif 'เอ' in sara and 'อือ' in sara: + elif 'เอ' in sara and 'อือ' in sara: sara.remove('เอ') sara.remove('อือ') - sara.append('อัว') - elif 'เอ' in sara and 'อา' in sara: + sara.append('อัว') + elif 'เอ' in sara and 'อา' in sara: sara.remove('เอ') sara.remove('อา') - sara.append('เอา') + sara.append('เอา') elif 'เ' in word and 'า' in word and 'ะ' in word: sara = [] sara.append('เอาะ') - if 'อือ' in sara and 'เออ' in sara: + if 'อือ' in sara and 'เออ' in sara: sara.remove('เออ') sara.remove('อือ') - sara.append('เอือ') + sara.append('เอือ') elif 'ออ' in sara and len(sara) > 1: - sara.remove('ออ') + sara.remove('ออ') elif 'ว' in word and len(sara) == 0: sara.append('อัว') if 'ั' in word and self.check_marttra(word) == 'กา': sara = [] sara.append('ไอ') - # In case อ + # In case of อ if word == 'เออะ': sara = [] sara.append('เออะ') @@ -170,20 +169,20 @@ def check_sara(self, word: str)-> str: sara.append('เอาะ') if 'ฤา' in word or 'ฦา' in word: sara = [] - sara.append('อือ') + sara.append('อือ') elif 'ฤ' in word or 'ฦ' in word: sara = [] - sara.append('อึ') - # In case กน - if sara == [] and len(word) == 2: + sara.append('อึ') + # In case of กน + if not sara and len(word) == 2: if word[-1] != 'ร': sara.append('โอะ') else: - sara.append('ออ') - elif sara == [] and len(word) == 3: - sara.append('ออ') - - # incase บ่ + sara.append('ออ') + elif not sara and len(word) == 3: + sara.append('ออ') + + # In case of บ่ if 'บ่' == word: sara = [] sara.append('ออ') @@ -193,8 +192,8 @@ def check_sara(self, word: str)-> str: if 'เ' in word and 'ื' in word and 'อ' in word: sara = [] sara.append('เอือ') - if sara == []: - return 'Cant 
find Sara in this word' + if not sara: + return 'Can\'t find Sara in this word' else: return sara[0] @@ -204,7 +203,7 @@ def check_marttra(self, word: str) -> str: Check the Thai spelling Section in the Thai word. :param str word: Thai word - :return: name spelling Section of the word. + :return: name of spelling Section of the word. :rtype: str :Example: @@ -245,11 +244,11 @@ def check_marttra(self, word: str) -> str: elif word[-1] in ['บ', 'ป', 'พ', 'ฟ', 'ภ']: return 'กบ' else: - if '็' in word: - return 'กา' - else: - return 'Cant find Marttra in this word' - + if '็' in word: + return 'กา' + else: + return 'Cant find Marttra in this word' + def is_sumpus(self, word1: str,word2: str) -> bool: """ @@ -257,7 +256,7 @@ def is_sumpus(self, word1: str,word2: str) -> bool: :param str word1: Thai word :param str word2: Thai word - :return: boolen + :return: boolean :rtype: bool :Example: @@ -289,24 +288,21 @@ def is_sumpus(self, word1: str,word2: str) -> bool: elif sara2 == 'อำ' and marttra2 == 'กม': sara2 = 'อำ' marttra2 = 'กา' - if marttra1 == marttra2 and sara1 == sara2: - return True - else: - return False - + return bool(marttra1 == marttra2 and sara1 == sara2) + def check_karu_lahu(self,text): if (self.check_marttra(text) != 'กา' or (self.check_marttra(text) == 'กา' and self.check_sara(text) in ['อา','อี', 'อือ', 'อู', 'เอ', 'แอ', 'โอ', 'ออ', 'เออ', 'เอีย', 'เอือ' ,'อัว']) or self.check_sara(text) in ['อำ','ไอ','เอา']) and text not in ['บ่','ณ','ธ','ก็']: return 'karu' else: return 'lahu' - + def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: """ Check the suitability of the poem according to Thai principles. :param str text: Thai poem - :param int k_type: Type of Thai poem - :return: the check of the suitability of the poem according to Thai principles. + :param int k_type: type of Thai poem + :return: the check results of the suitability of the poem according to Thai principles. :rtype: Union[List[str], str] :Example: @@ -320,7 +316,7 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: # output: The poem is correct according to the principle. print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4)) - # # -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"] + # # -> ["Can't find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"] """ if k_type == 8: try: @@ -332,9 +328,8 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: list_sumpus_sent4 = [] for i, sent in enumerate(text.split()): sub_sent = subword_tokenize(sent,engine='dict') - # print(i) if len(sub_sent) > 10: - error.append('In the sentence'+str(i+2)+'there are more than 10 words.'+str(sub_sent)) + error.append('In sentence '+str(i+2)+', there are more than 10 words. 
'+str(sub_sent)) if (i+1) % 4 == 1: list_sumpus_sent1.append(sub_sent[-1]) elif (i+1) % 4 == 2: @@ -345,27 +340,27 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: elif (i+1) % 4 == 0: list_sumpus_sent4.append(sub_sent[-1]) if len(list_sumpus_sent1) != len(list_sumpus_sent2h) or len(list_sumpus_sent2h) != len(list_sumpus_sent2l) or len(list_sumpus_sent2l) != len(list_sumpus_sent3) or len(list_sumpus_sent3) != len(list_sumpus_sent4) or len(list_sumpus_sent4) != len(list_sumpus_sent1): - return 'The poem does not complete 4 sentences.' + return 'The poem does not have 4 complete sentences.' else: for i in range(len(list_sumpus_sent1)): countwrong = 0 for j in list_sumpus_sent2h[i]: - if self.is_sumpus(list_sumpus_sent1[i],j) == False: - countwrong +=1 + if self.is_sumpus(list_sumpus_sent1[i], j) is False: + countwrong +=1 if countwrong > 3: - error.append('Cant find rhyme between paragraphs '+str((list_sumpus_sent1[i],list_sumpus_sent2h[i]))+'in paragraph '+str(i+1)) - if self.is_sumpus(list_sumpus_sent2l[i],list_sumpus_sent3[i]) == False: + error.append('Can\'t find rhyme between paragraphs '+str((list_sumpus_sent1[i],list_sumpus_sent2h[i]))+' in paragraph '+str(i+1)) + if self.is_sumpus(list_sumpus_sent2l[i], list_sumpus_sent3[i]) is False: # print(sumpus_sent2l,sumpus_sent3) - error.append('Cant find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent3[i]))+'in paragraph '+str(i+1)) + error.append('Can\'t find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent3[i]))+' in paragraph '+str(i+1)) if i > 0: - if self.is_sumpus(list_sumpus_sent2l[i],list_sumpus_sent4[i-1]) == False: - error.append('Cant find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent4[i-1]))+'in paragraph '+str(i+1)) - if error == []: + if self.is_sumpus(list_sumpus_sent2l[i], list_sumpus_sent4[i-1]) is False: + error.append('Can\'t find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent4[i-1]))+' in paragraph '+str(i+1)) + if not error: return 'The poem is correct according to the principle.' else: return error except: - return 'Something went wrong Make sure you enter it in correct form of klon 8.' + return 'Something went wrong. Make sure you enter it in the correct form of klon 8.' elif k_type == 4: try: error = [] @@ -377,7 +372,7 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: for i, sent in enumerate(text.split()): sub_sent = subword_tokenize(sent,engine='dict') if len(sub_sent) > 5: - error.append('In the sentence'+str(i+2)+'there are more than 4 words.'+str(sub_sent)) + error.append('In sentence '+str(i+2)+', there are more than 4 words. '+str(sub_sent)) if (i+1) % 4 == 1: list_sumpus_sent1.append(sub_sent[-1]) elif (i+1) % 4 == 2: @@ -389,39 +384,39 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: elif (i+1) % 4 == 0: list_sumpus_sent4.append(sub_sent[-1]) if len(list_sumpus_sent1) != len(list_sumpus_sent2h) or len(list_sumpus_sent2h) != len(list_sumpus_sent2l) or len(list_sumpus_sent2l) != len(list_sumpus_sent3) or len(list_sumpus_sent3) != len(list_sumpus_sent4) or len(list_sumpus_sent4) != len(list_sumpus_sent1): - return 'The poem does not complete 4 sentences.' + return 'The poem does not have 4 complete sentences.' 
else: for i in range(len(list_sumpus_sent1)): countwrong = 0 for j in list_sumpus_sent2h[i]: # print(list_sumpus_sent1[i],j) - if self.is_sumpus(list_sumpus_sent1[i],j) == False: - countwrong +=1 + if self.is_sumpus(list_sumpus_sent1[i], j) is False: + countwrong +=1 if countwrong > 1: - error.append('Cant find rhyme between paragraphs '+str((list_sumpus_sent1[i],list_sumpus_sent2h[i]))+'in paragraph '+str(i+1)) - if self.is_sumpus(list_sumpus_sent2l[i],list_sumpus_sent3[i]) == False: + error.append('Can\'t find rhyme between paragraphs '+str((list_sumpus_sent1[i],list_sumpus_sent2h[i]))+'in paragraph '+str(i+1)) + if self.is_sumpus(list_sumpus_sent2l[i], list_sumpus_sent3[i]) is False: # print(sumpus_sent2l,sumpus_sent3) - error.append('Cant find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent3[i]))+'in paragraph '+str(i+1)) + error.append('Can\'t find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent3[i]))+'in paragraph '+str(i+1)) if i > 0: - if self.is_sumpus(list_sumpus_sent2l[i],list_sumpus_sent4[i-1]) == False: - error.append('Cant find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent4[i-1]))+'in paragraph '+str(i+1)) - if error == []: + if self.is_sumpus(list_sumpus_sent2l[i], list_sumpus_sent4[i-1]) is False: + error.append('Can\'t find rhyme between paragraphs '+str((list_sumpus_sent2l[i],list_sumpus_sent4[i-1]))+' in paragraph '+str(i+1)) + if not error: return 'The poem is correct according to the principle.' else: return error except: - return 'Something went wrong Make sure you enter it in correct form.' - + return 'Something went wrong. Make sure you enter it in the correct form.' + else: - return 'Something went wrong Make sure you enter it in correct form.' - + return 'Something went wrong. Make sure you enter it in the correct form.' 
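    # Illustrative sketch (not part of the diff; it mirrors the docstring example
    # above): with a well-formed klon 4, check_klon returns the success message:
    #     kv = KhaveeVerifier()
    #     kv.check_klon('ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง '
    #                   'ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร', k_type=4)
    #     -> 'The poem is correct according to the principle.'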
+ def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = False) -> Union[List[bool], List[str], bool, str]: """ - Thai tonal word checker + Checker of Thai tonal words :param Union[List[str], str] text: Thai word or list of Thai words :param bool dead_syllable_as_aek: if True, dead syllable will be considered as aek - :return: the check if the word is aek or too or False(not both) or list of the check if input is list + :return: the check result if the word is aek or too or False (not both) or list of check results if input is list :rtype: Union[List[bool], List[str], bool, str] :Example: @@ -457,19 +452,19 @@ def check_aek_too(self, text: Union[List[str], str], dead_syllable_as_aek:bool = def handle_karun_sound_silence(self, word: str) -> str: """ - Handle sound silence in Thai word using '์' character (Karun) - by stripping all the characters before the 'Karun' character that should be silenced + Handle silent sounds in Thai words using '์' character (Karun) + by stripping all characters before the 'Karun' character that should be silenced :param str text: Thai word - :return: Thai word with silence word stripped + :return: Thai word with silent words stripped :rtype: str """ - sound_silenced = True if word.endswith('์') else False + sound_silenced = word.endswith('์') if not sound_silenced: return word thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" locate_silenced = word.rfind('์') - 1 - can_silence_two = True if word[locate_silenced-2] in thai_consonants else False + can_silence_two = word[locate_silenced-2] in thai_consonants cut_off = 2 if can_silence_two else 1 word = word[:locate_silenced + 1 - cut_off] return word diff --git a/pythainlp/khavee/example.py b/pythainlp/khavee/example.py index b6dfba79c..d0e0400b6 100644 --- a/pythainlp/khavee/example.py +++ b/pythainlp/khavee/example.py @@ -19,48 +19,48 @@ print('เพื่อน ล้วน',kv.is_sumpus('เพื่อน','ล้วน')) # False -# การตรวจสอบคำ ครุ ลหุ +# การตรวจสอบคำ ครุ ลหุ print('สรร',kv.check_karu_lahu('สรร')) #karu -# การตรวจสอบคำ ครุ ลหุ +# การตรวจสอบคำ ครุ ลหุ print('ชิชะ',kv.check_karu_lahu('ชิชะ')) # lahu # การตรวจสอบกลอน 8 ที่ถูกฉันทลักษณ์ -print(kv.check_klon('''ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล +print(kv.check_klon('''ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล เรื่องฟิสิกส์คณิตศาสตร์เอิร์นอดทน เล่นเกมเก่งลำดับต้นของโรงเรียน -ต่อมาหยกธนัชพรชอบนอนหลับ แต่ผลลัพธ์คือฉลาดเรื่องอ่านเขียน +ต่อมาหยกธนัชพรชอบนอนหลับ แต่ผลลัพธ์คือฉลาดเรื่องอ่านเขียน เหมือนจะเล่นแต่เขายังพากเพียร ในการเรียนการเล่นบ้างคละกันไป นรภัทรพุกกะมานป่านจอมแก่น ทั่วแว่นแคว้นโดนเขาแกล้งไม่สงสัย เรื่องวิศวะเก่งกาจประหลาดใจ เรื่องฟิสิกส์ไร้ผู้ใดมาต่อกร นริศราอีฟเก่งกว่าใครเพื่อน คอยช่วยเตือนเรื่องงานคอยสั่งสอน อ่านตำราหาความรู้ไม่ละทอน เป็นคนดีศรีนครของจิตรลดา -ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมนี้เก่งกาจไม่กังขา -เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''',k_type=8)) +ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมนี้เก่งกาจไม่กังขา +เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''', k_type=8)) # -> The poem is correct according to the principle. 
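# การตัดอักษรที่อยู่ใต้เครื่องหมายการันต์ (an extra sketch for illustration;
# the expected output follows handle_karun_sound_silence as implemented above)
print(kv.handle_karun_sound_silence('สัตว์'))
# -> 'สัต'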
# การตรวจสอบกลอน 8 ที่ผิดฉันทลักษณ์
-print(kv.check_klon('''ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล
+print(kv.check_klon('''ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล
 เรื่องฟิสิกส์คณิตศาสตร์เอิร์นอดทน เล่นเกมเก่งลำดับต้นของโรงเรียน
-ต่อมาหยกธนัชพรชอบนอนหลับ แต่ผลลัพธ์คือฉลาดเรื่องอ่านเขียน
+ต่อมาหยกธนัชพรชอบนอนหลับ แต่ผลลัพธ์คือฉลาดเรื่องอ่านเขียน
 เหมือนจะเล่นแต่เขายังพากเพียร ในการเรียนการเล่นบ้างคละกันไป
 นรภัทรพุกกะมานป่านจอมแก่น ทั่วแว่นแคว้นโดนเขาแกล้งไม่สงสัย
 เรื่องวิศวะเก่งกาจประหลาดใจ เรื่องฟิสิกส์ไร้ผู้ใดมาต่อไป
 นริศราอีฟเก่งกว่าใครเพื่อน คอยช่วยเตือนเรื่องงานคอยสั่งสอน
 อ่านตำราหาความรู้ไม่ละทอน เป็นคนดีศรีนครของจิตรลดา
 ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมเอ่อเก่งกาจไม่กังขา
-เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''',k_type=8))
-# -> ["Cant find rhyme between paragraphs ('สอน', 'ไป')in paragraph 4", "Cant find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ'])in paragraph 5"]
+เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''', k_type=8))
+# -> ["Can't find rhyme between paragraphs ('สอน', 'ไป') in paragraph 4", "Can't find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ']) in paragraph 5"]

 # การตรวจสอบกลอน 4 ที่ถูกฉันทลักษณ์
-print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
+print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร''', k_type=4))
 # -> The poem is correct according to the principle.

 # การตรวจสอบกลอน 4 ที่ผิดฉันทลักษณ์
-print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
-# -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
+print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''', k_type=4))
+# -> ["Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"]

 # การเช็คคำเอกโท
 print(kv.check_aek_too('เอง'), kv.check_aek_too('เอ่ง'), kv.check_aek_too('เอ้ง'))
diff --git a/pythainlp/parse/core.py b/pythainlp/parse/core.py
index e9949e1ad..0d567b01a 100644
--- a/pythainlp/parse/core.py
+++ b/pythainlp/parse/core.py
@@ -25,24 +25,24 @@ def dependency_parsing(
 """
 Dependency Parsing

- :param str text: text to do dependency parsing
+ :param str text: text to apply dependency parsing to
 :param str model: model for using with engine \
 (for esupar and transformers_ud)
 :param str tag: output type (str or list)
- :param str engine: the name dependency parser
+ :param str engine: the name of the dependency parser
 :return: str (conllu) or List
 :rtype: Union[List[List[str]], str]

 **Options for engine**
- * *esupar* (default) - Tokenizer POS-tagger and Dependency-parser \
- with BERT/RoBERTa/DeBERTa model. `GitHub \
+ * *esupar* (default) - Tokenizer, POS tagger, and dependency parser \
+ using BERT/RoBERTa/DeBERTa models. `GitHub \
 `_
- * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
- for Thai language, working on Universal Dependencies. \
+ * *spacy_thai* - Tokenizer, POS tagger, and dependency parser \
+ for the Thai language, using Universal Dependencies.
\ `GitHub `_ * *transformers_ud* - TransformersUD \ `GitHub `_ - * *ud_goeswith* - POS-tagging and dependency-parsing with \ + * *ud_goeswith* - POS tagging and dependency parsing \ using `goeswith` for subwords **Options for model (esupar engine)** @@ -50,38 +50,38 @@ def dependency_parsing( `Huggingface \ `_ * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \ - pre-trained on Thai Wikipedia texts for POS-tagging and \ - dependency-parsing `Huggingface \ + pre-trained on Thai Wikipedia texts for POS tagging and \ + dependency parsing `Huggingface \ `_ * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \ - pre-trained on Thai Wikipedia texts for POS-tagging and \ - dependency-parsing. (syllable level) `Huggingface \ + pre-trained on Thai Wikipedia texts for POS tagging and \ + dependency parsing. (syllable level) `Huggingface \ `_ * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \ - pre-trained on Thai Wikipedia texts for POS-tagging \ - and dependency-parsing. (char level) `Huggingface \ + pre-trained on Thai Wikipedia texts for POS tagging \ + and dependency parsing. (char level) `Huggingface \ `_ - If you want to train model for esupar, you can read \ + If you want to train models for esupar, you can read \ `Huggingface `_ **Options for model (transformers_ud engine)** * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \ DeBERTa(V2) model pretrained on Thai Wikipedia texts \ - for dependency-parsing (head-detection on Universal \ - Dependencies) as question-answering, derived from \ + for dependency parsing (head-detection using Universal \ + Dependencies) and question-answering, derived from \ deberta-base-thai. \ trained by th_blackboard.conll. `Huggingface \ `_ * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \ roberta model pretrained on Thai Wikipedia texts \ - for dependency-parsing. `Huggingface \ + for dependency parsing. `Huggingface \ `_ **Options for model (ud_goeswith engine)** * *KoichiYasuoka/deberta-base-thai-ud-goeswith* (default) - \ This is a DeBERTa(V2) model pre-trained on Thai Wikipedia \ - texts for POS-tagging and dependency-parsing (using goeswith for subwords) \ + texts for POS tagging and dependency parsing (using goeswith for subwords) \ `Huggingface `_ :Example: diff --git a/pythainlp/parse/esupar_engine.py b/pythainlp/parse/esupar_engine.py index 3e113f502..259cb00b7 100644 --- a/pythainlp/parse/esupar_engine.py +++ b/pythainlp/parse/esupar_engine.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -esupar: Tokenizer POS-tagger and Dependency-parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages +esupar: Tokenizer, POS tagger and dependency parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages GitHub: https://github.com/KoichiYasuoka/esupar """ @@ -14,7 +14,7 @@ class Parse: def __init__(self, model: str = "th") -> None: - if model == None: + if model is None: model = "th" self.nlp = esupar.load(model) diff --git a/pythainlp/parse/spacy_thai_engine.py b/pythainlp/parse/spacy_thai_engine.py index 1891bfeab..dabf4caeb 100644 --- a/pythainlp/parse/spacy_thai_engine.py +++ b/pythainlp/parse/spacy_thai_engine.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -spacy_thai: Tokenizer, POS-tagger, and dependency-parser for Thai language, working on Universal Dependencies. +spacy_thai: Tokenizer, POS tagger, and dependency parser for the Thai language using Universal Dependencies. 
GitHub: https://github.com/KoichiYasuoka/spacy-thai """ diff --git a/pythainlp/parse/transformers_ud.py b/pythainlp/parse/transformers_ud.py index f871e780c..2adfe2849 100644 --- a/pythainlp/parse/transformers_ud.py +++ b/pythainlp/parse/transformers_ud.py @@ -29,7 +29,7 @@ class Parse: def __init__( self, model: str = "KoichiYasuoka/deberta-base-thai-ud-head" ) -> None: - if model == None: + if model is None: model = "KoichiYasuoka/deberta-base-thai-ud-head" self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForQuestionAnswering.from_pretrained(model) diff --git a/pythainlp/soundex/core.py b/pythainlp/soundex/core.py index d2c84192a..4009e337b 100644 --- a/pythainlp/soundex/core.py +++ b/pythainlp/soundex/core.py @@ -48,7 +48,7 @@ def soundex( * *metasound* - Thai soundex algorithm based on a combination of Metaphone and Soundex proposed by Snae & Brückner [#metasound]_ * *prayut_and_somchaip* - Thai-English Cross-Language Transliterated - Word Retrieval using Soundex Technique [#prayut_and_somchaip]_ + Word Retrieval using Soundex Technique [#prayut_and_somchaip]_ :Example: :: diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py index c63f22c7e..fbdd66a3b 100644 --- a/pythainlp/soundex/lk82.py +++ b/pythainlp/soundex/lk82.py @@ -34,14 +34,14 @@ # silenced _RE_KARANT = re.compile(r"จน์|มณ์|ณฑ์|ทร์|ตร์|[ก-ฮ]์|[ก-ฮ][ะ-ู]์") -# signs, symbols, vowel that has no explicit sound +# signs, symbols, vowel that has no explicit sounds # Paiyannoi, Phinthu, Maiyamok, Maitaikhu, Nikhahit _RE_SIGN = re.compile(r"[\u0e2f\u0e3a\u0e46\u0e47\u0e4d]") def lk82(text: str) -> str: """ - This function converts Thai text into phonetic code with the a + This function converts Thai text into phonetic code with the Thai soundex algorithm named **LK82** [#lk82]_. :param str text: Thai word @@ -124,11 +124,11 @@ def lk82(text: str) -> str: else: res.append(c.translate(_TRANS2)) # 12. - # 13. remove repetitives + # 13. remove repetitions res2 = [res[0]] for i in range(1, len(res)): if res[i] != res[i - 1]: res2.append(res[i]) - # 14. fill zeros + # 14. fill with zeros return ("".join(res2) + "0000")[:5] diff --git a/pythainlp/soundex/metasound.py b/pythainlp/soundex/metasound.py index 20ab3b9f0..91117721e 100644 --- a/pythainlp/soundex/metasound.py +++ b/pythainlp/soundex/metasound.py @@ -37,9 +37,9 @@ def metasound(text: str, length: int = 4) -> str: """ This function converts Thai text into phonetic code with the - mactching technique called **MetaSound** + matching technique called **MetaSound** [#metasound]_ (combination between Soundex and Metaphone algorithms). - MetaSound algorithm was developed specifically for Thai language. + MetaSound algorithm was developed specifically for the Thai language. :param str text: Thai text :param int length: preferred length of the MetaSound code (default is 4) diff --git a/pythainlp/soundex/prayut_and_somchaip.py b/pythainlp/soundex/prayut_and_somchaip.py index a00631aff..bb51d73eb 100644 --- a/pythainlp/soundex/prayut_and_somchaip.py +++ b/pythainlp/soundex/prayut_and_somchaip.py @@ -38,7 +38,7 @@ def prayut_and_somchaip(text: str, length: int = 4) -> str: """ This function converts English-Thai Cross-Language Transliterated Word into - phonetic code with the mactching technique called **Soundex** [#prayut_and_somchaip]_. + phonetic code with the matching technique called **Soundex** [#prayut_and_somchaip]_. 
:param str text: English-Thai Cross-Language Transliterated Word :param int length: preferred length of the Soundex code (default is 4) diff --git a/pythainlp/soundex/sound.py b/pythainlp/soundex/sound.py index a6629c95e..ad11fe933 100644 --- a/pythainlp/soundex/sound.py +++ b/pythainlp/soundex/sound.py @@ -13,21 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. from typing import List -from pythainlp.transliterate import pronunciate, transliterate -from pythainlp.tokenize import word_tokenize - import panphon import panphon.distance +from pythainlp.transliterate import pronunciate, transliterate +from pythainlp.tokenize import word_tokenize _ft = panphon.FeatureTable() _dst = panphon.distance.Distance() def _clean_ipa(ipa: str) -> str: """ - Clean IPA by remove tone and remove space between phone + Clean IPA by removing tones and space between phonetic codes :param str ipa: IPA text - :return: IPA that remove tone from the text + :return: IPA with tones removed from the text :rtype: str """ return ipa.replace("˩˩˦","").replace("˥˩","").replace("˨˩","").replace("˦˥","").replace("˧","").replace("˧","").replace(" .",".").replace(". ",".").strip() @@ -37,7 +36,7 @@ def word2audio(word: str) -> str: Convert word to IPA :param str word: Thai word - :return: IPA that remove tone from the text + :return: IPA with tones removed from the text :rtype: str :Example: @@ -58,7 +57,7 @@ def audio_vector(word:str) -> List[List[int]]: Convert audio to vector list :param str word: Thai word - :return: List feature from panphon + :return: List of features from panphon :rtype: List[List[int]] :Example: @@ -77,7 +76,7 @@ def word_approximation(word:str, list_word:List[str]): :param str word: Thai word :param str list_word: Thai word - :return: List of approximation of word (The smaller the value, the closer) + :return: List of approximation of words (The smaller the value, the closer) :rtype: List[str] :Example: diff --git a/pythainlp/spell/__init__.py b/pythainlp/spell/__init__.py index 0cbde611d..c4ed9e821 100644 --- a/pythainlp/spell/__init__.py +++ b/pythainlp/spell/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Spell checking and spelling correction. +Spell checking and correction. """ __all__ = [ diff --git a/pythainlp/spell/core.py b/pythainlp/spell/core.py index a054506bc..71438317a 100644 --- a/pythainlp/spell/core.py +++ b/pythainlp/spell/core.py @@ -24,16 +24,16 @@ def spell(word: str, engine: str = "pn") -> List[str]: """ - Provides a list of possible correct spelling of the given word. + Provides a list of possible correct spellings of the given word. The list of words are from the words in the dictionary that incurs an edit distance value of 1 or 2. The result is a list of words sorted by their occurrences in the spelling dictionary in descending order. - :param str word: Word to spell check + :param str word: Word to check spell of :param str engine: * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) - * *phunspell* - A spell checker utilizing spylls a port of Hunspell. + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. * *tltk* - wrapper for `TLTK `_. @@ -92,10 +92,10 @@ def correct(word: str, engine: str = "pn") -> str: Corrects the spelling of the given word by returning the correctly spelled word. 
- :param str word: word to correct spelling + :param str word: word to correct spelling of :param str engine: * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) - * *phunspell* - A spell checker utilizing spylls a port of Hunspell. + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. * *wanchanberta_thai_grammarly* - WanchanBERTa Thai Grammarly :return: the corrected word @@ -133,7 +133,7 @@ def correct(word: str, engine: str = "pn") -> str: from pythainlp.spell.wanchanberta_thai_grammarly import correct as SPELL_CHECKER text_correct = SPELL_CHECKER(word) - + else: text_correct = DEFAULT_SPELL_CHECKER.correct(word) @@ -142,14 +142,14 @@ def correct(word: str, engine: str = "pn") -> str: def spell_sent(list_words: List[str], engine: str = "pn") -> List[List[str]]: """ - Provides a list of possible correct spelling of sentence + Provides a list of possible correct spellings of sentence - :param List[str] list_words: list word of sentence + :param List[str] list_words: list of words in sentence :param str engine: * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) - * *phunspell* - A spell checker utilizing spylls a port of Hunspell. + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. - :return: list of possible correct words + :return: list of possibly correct words :rtype: List[List[str]] :Example: @@ -180,15 +180,15 @@ def spell_sent(list_words: List[str], engine: str = "pn") -> List[List[str]]: def correct_sent(list_words: List[str], engine: str = "pn") -> List[str]: """ - Corrects the spelling of the given sentence by returning + Corrects and returns the spelling of the given sentence - :param List[str] list_words: list word of sentence + :param List[str] list_words: list of words in sentence :param str engine: * *pn* - Peter Norvig's algorithm [#norvig_spellchecker]_ (default) - * *phunspell* - A spell checker utilizing spylls a port of Hunspell. + * *phunspell* - A spell checker utilizing spylls, a port of Hunspell. * *symspellpy* - symspellpy is a Python port of SymSpell v6.5. * *wanchanberta_thai_grammarly* - WanchanBERTa Thai Grammarly - :return: the corrected list sentences of word + :return: the corrected list of words in sentence :rtype: List[str] :Example: diff --git a/pythainlp/spell/phunspell.py b/pythainlp/spell/phunspell.py index 5c0d49ca9..de53444ca 100644 --- a/pythainlp/spell/phunspell.py +++ b/pythainlp/spell/phunspell.py @@ -15,7 +15,7 @@ """ Phunspell -A pure Python spell checker utilizing spylls a port of Hunspell. +A pure Python spell checker utilizing spylls, a port of Hunspell. :See Also: * \ diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py index 9bd8b39f9..4cdfab5d7 100644 --- a/pythainlp/spell/pn.py +++ b/pythainlp/spell/pn.py @@ -46,7 +46,7 @@ def _keep( dict_filter: Callable[[str], bool], ) -> bool: """ - Checks whether a given word has the required minimum frequency of min_freq + Checks whether a given word has the required minimum frequency min_freq and its character length is between min_len and max_len (inclusive). 
""" if not word_freq or word_freq[1] < min_freq: @@ -61,7 +61,7 @@ def _keep( def _edits1(word: str) -> Set[str]: """ - Returns a set of words with edit distance of 1 from the input word + Returns a set of words with an edit distance of 1 from the input word """ splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] @@ -74,7 +74,7 @@ def _edits1(word: str) -> Set[str]: def _edits2(word: str) -> Set[str]: """ - Returns a set of words with edit distance of 2 from the input word + Returns a set of words with an edit distance of 2 from the input word """ return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1)) @@ -92,12 +92,12 @@ def _convert_custom_dict( Converts a custom dictionary to a list of (str, int) tuples """ if isinstance(custom_dict, dict): - custom_dict = [(word, freq) for word, freq in custom_dict.items()] + custom_dict = list(custom_dict.items()) i = iter(custom_dict) first_member = next(i) if isinstance(first_member, str): - # create tuples of a word with frequency equal to 1, + # create tuples of a word with frequency equaling 1, # and filter word list custom_dict = [ (word, 1) @@ -138,7 +138,7 @@ def __init__( `Thai National Corpus `_ Basically, Norvig's spell checker will choose the most likely - spelling correction give a word by searching for candidate + corrected spelling given a word by searching for candidates of corrected words based on edit distance. Then, it selects the candidate with the highest word occurrence probability. @@ -146,7 +146,7 @@ def __init__( :param str custom_dict: A custom spelling dictionary. This can be: (1) a dictionary (`dict`), with words (`str`) as keys and frequencies (`int`) as values; - (2) an iterable (list, tuple, or set) of word + (2) an iterable (list, tuple, or set) of words (`str`) and frequency (`int`) tuples: `(str, int)`; or (3) an iterable of just words (`str`), without @@ -161,12 +161,12 @@ def __init__( (default = 40) :param func dict_filter: A function to filter the dictionary. Default filter removes any word - with number or non-Thai characters. + with numbers or non-Thai characters. If no filter is required, use None. 
""" if not custom_dict: # default, use Thai National Corpus # TODO: #680 change the dict - custom_dict = [(i,j) for i,j in tnc.word_freqs()] + custom_dict = [(i, j) for i, j in tnc.word_freqs()] if not dict_filter: dict_filter = _no_filter @@ -201,12 +201,12 @@ def dictionary(self) -> ItemsView[str, int]: def known(self, words: Iterable[str]) -> List[str]: """ - Returns a list of given words that found in the spelling dictionary + Returns a list of given words found in the spelling dictionary :param list[str] words: A list of words to check if they exist in the spelling dictionary - :return: intersection of the given words list and words + :return: intersection of the given word list and words in the spelling dictionary :rtype: list[str] @@ -233,7 +233,7 @@ def prob(self, word: str) -> float: Returns the probability of an input word, according to the spelling dictionary - :param str word: A word to check its probability of occurrence + :param str word: A word to check occurrence probability of :return: word occurrence probability :rtype: float @@ -261,7 +261,7 @@ def freq(self, word: str) -> int: Returns the frequency of an input word, according to the spelling dictionary - :param str word: A word to check its frequency + :param str word: A word to check frequency of :return: frequency of the given word in the spelling dictionary :rtype: int @@ -284,20 +284,20 @@ def spell(self, word: str) -> List[str]: """ Returns a list of all correctly-spelled words whose spelling is similar to the given word by edit distance metrics. - The returned list of words will be sorted by the decreasing + The returned list of words will be sorted by decreasing order of word frequencies in the word spelling dictionary. - First, if the input word is spelled-correctly, - this method returns the list of exactly one word which is itself. - Next, this method looks for a list of all correctly-spelled words - whose edit distance value is 1 within the input word. - If there is no such word, that the search expands to + First, if the input word is spelled correctly, + this method returns a list of exactly one word which is itself. + Next, this method looks for a list of all correctly spelled words + whose edit distance value is 1 from the input word. + If there is no such word, then the search expands to a list of words whose edit distance value is 2. - And if that still fails, the list of input word is returned. + And if that still fails, the list of input words is returned. - :param str word: A word to check its spelling + :param str word: A word to check spelling of - :return: list of possible correct words within 1 or 2 edit distance + :return: list of possibly correct words within 1 or 2 edit distance and sorted by frequency of word occurrence in the spelling dictionary in descending order. 
:rtype: list[str] @@ -335,7 +335,7 @@ def correct(self, word: str) -> str: Returns the most possible word, using the probability from the spelling dictionary - :param str word: A word to correct its spelling + :param str word: A word to correct spelling of :return: the correct spelling of the given word :rtype: str diff --git a/pythainlp/spell/symspellpy.py b/pythainlp/spell/symspellpy.py index e231e7c72..ac750e29e 100644 --- a/pythainlp/spell/symspellpy.py +++ b/pythainlp/spell/symspellpy.py @@ -41,7 +41,7 @@ def spell(text: str, max_edit_distance: int = 2) -> List[str]: return [ - str(i).split(",")[0] + str(i).split(",", maxsplit=1)[0] for i in list( sym_spell.lookup( text, Verbosity.CLOSEST, max_edit_distance=max_edit_distance @@ -56,7 +56,7 @@ def correct(text: str, max_edit_distance: int = 1) -> str: def spell_sent(list_words: List[str], max_edit_distance: int = 2) -> List[str]: _temp = [ - str(i).split(",")[0].split(" ") + str(i).split(",", maxsplit=1)[0].split(" ") for i in list( sym_spell.lookup_compound( " ".join(list_words), diff --git a/pythainlp/spell/wanchanberta_thai_grammarly.py b/pythainlp/spell/wanchanberta_thai_grammarly.py index 9004d8838..380416198 100644 --- a/pythainlp/spell/wanchanberta_thai_grammarly.py +++ b/pythainlp/spell/wanchanberta_thai_grammarly.py @@ -31,7 +31,7 @@ class BertModel(torch.nn.Module): def __init__(self): - super(BertModel, self).__init__() + super().__init__() self.bert = BertForTokenClassification.from_pretrained('bookpanda/wangchanberta-base-att-spm-uncased-tagging') def forward(self, input_id, mask, label): @@ -45,9 +45,7 @@ def forward(self, input_id, mask, label): def align_word_ids(texts): tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True) - c = tokenizer.convert_ids_to_tokens(tokenized_inputs.input_ids) word_ids = tokenized_inputs.word_ids() - previous_word_idx = None label_ids = [] for word_idx in word_ids: @@ -55,11 +53,10 @@ def align_word_ids(texts): label_ids.append(-100) else: try: - label_ids.append(2) + label_ids.append(2) except: label_ids.append(-100) - previous_word_idx = word_idx return label_ids def evaluate_one_text(model, sentence): @@ -67,13 +64,9 @@ def evaluate_one_text(model, sentence): mask = text['attention_mask'][0].unsqueeze(0).to(device) input_id = text['input_ids'][0].unsqueeze(0).to(device) label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device) - # print(f"input_ids: {input_id}") - # print(f"attnetion_mask: {mask}") - # print(f"label_ids: {label_ids}") logits = tagging_model(input_id, mask, None) logits_clean = logits[0][label_ids != -100] - # print(f"logits_clean: {logits_clean}") predictions = logits_clean.argmax(dim=1).tolist() prediction_label = [ids_to_labels[i] for i in predictions] @@ -88,8 +81,6 @@ def correct(text): ans = [] i_f = evaluate_one_text(tagging_model, text) a = tokenizer(text) - b = a['input_ids'] - c = tokenizer.convert_ids_to_tokens(b) i_f_len = len(i_f) for j in range(i_f_len): if i_f[j] == 'i': @@ -117,4 +108,4 @@ def correct(text): final_output = ''.join(final_output) final_output = final_output.replace("▁", " ") final_output = final_output.replace("", "") - return final_output \ No newline at end of file + return final_output diff --git a/pythainlp/summarize/core.py b/pythainlp/summarize/core.py index 26995dcc1..a4187d86c 100644 --- a/pythainlp/summarize/core.py +++ b/pythainlp/summarize/core.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" -Text summarization and Keyword extraction +Text summarization and keyword extraction """ from typing import List, Iterable, Optional, Tuple @@ -36,14 +36,14 @@ def summarize( """ This function summarizes text based on frequency of words. - Under the hood, this function first tokenize sentence from the given + Under the hood, this function first tokenizes sentences from the given text with :func:`pythainlp.tokenize.sent_tokenize`. - Then, computes frequencies of tokenized words + Then, it computes frequencies of tokenized words (with :func:`pythainlp.tokenize.word_tokenize`) in all sentences - and normalized with maximum word frequency. The words with normalized - frequncy that are less than 0.1 or greater than 0.9 will be + and normalizes them with maximum word frequency. The words with normalized + frequencies that are less than 0.1 or greater than 0.9 will be filtered out from frequency dictionary. Finally, it picks *n* sentences - with highest sum of normalized frequency from all words + with highest sum of normalized frequency from all words which are in the sentence and also appear in the frequency dictionary. :param str text: text to be summarized diff --git a/pythainlp/summarize/keybert.py b/pythainlp/summarize/keybert.py index 4d67a7a8d..015a76699 100644 --- a/pythainlp/summarize/keybert.py +++ b/pythainlp/summarize/keybert.py @@ -122,7 +122,7 @@ def extract_keywords( if not text: return [] - # generate all list of keyword / keyphrases + # generate all lists of keywords / keyphrases stop_words_ = stop_words if stop_words else thai_stopwords() kw_candidates = _generate_ngrams( text, keyphrase_ngram_range, min_df, tokenizer, stop_words_ @@ -144,7 +144,7 @@ def extract_keywords( def embed(self, docs: Union[str, List[str]]) -> np.ndarray: """ - Create an embedding of each input in `docs` by averaging vectors from last hidden layer. + Create an embedding of each input in `docs` by averaging vectors from the last hidden layer. """ embs = self.ft_pipeline(docs) if isinstance(docs, str) or len(docs) == 1: diff --git a/pythainlp/summarize/mt5.py b/pythainlp/summarize/mt5.py index d1b2d8187..a7ebe3aea 100644 --- a/pythainlp/summarize/mt5.py +++ b/pythainlp/summarize/mt5.py @@ -15,8 +15,8 @@ """ Summarization by mT5 model """ -from transformers import T5Tokenizer, MT5ForConditionalGeneration from typing import List +from transformers import T5Tokenizer, MT5ForConditionalGeneration from pythainlp.summarize import CPE_KMUTT_THAI_SENTENCE_SUM diff --git a/pythainlp/tag/_tag_perceptron.py b/pythainlp/tag/_tag_perceptron.py index aa9e3b3f2..9106bfe52 100644 --- a/pythainlp/tag/_tag_perceptron.py +++ b/pythainlp/tag/_tag_perceptron.py @@ -32,7 +32,7 @@ from typing import Dict, Iterable, List, Tuple, Union -class AveragedPerceptron(object): +class AveragedPerceptron(): """ An averaged perceptron, as implemented by Matthew Honnibal. diff --git a/pythainlp/tag/chunk.py b/pythainlp/tag/chunk.py index a67954e8c..40b45d304 100644 --- a/pythainlp/tag/chunk.py +++ b/pythainlp/tag/chunk.py @@ -19,13 +19,13 @@ def chunk_parse( sent: List[Tuple[str, str]], engine: str = "crf", corpus: str = "orchidpp" ) -> List[str]: """ - This function parse thai sentence to phrase structure in IOB format. + This function parses Thai sentence to phrase structure in IOB format. 
- :param list sent: list [(word,part-of-speech)] + :param list sent: list [(word, part-of-speech)] :param str engine: chunk parse engine (now, it has crf only) :param str corpus: chunk parse corpus (now, it has orchidpp only) - :return: a list of tuple (word,part-of-speech,chunking) + :return: a list of tuples (word, part-of-speech, chunking) :rtype: List[str] :Example: diff --git a/pythainlp/tag/crfchunk.py b/pythainlp/tag/crfchunk.py index 852904559..8b6dd3cfc 100644 --- a/pythainlp/tag/crfchunk.py +++ b/pythainlp/tag/crfchunk.py @@ -12,12 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple from pycrfsuite import Tagger as CRFTagger from pythainlp.corpus import path_pythainlp_corpus, thai_stopwords -def _is_stopword(word: str) -> bool: # check thai stopword +def _is_stopword(word: str) -> bool: # check Thai stopword return word in thai_stopwords() diff --git a/pythainlp/tag/locations.py b/pythainlp/tag/locations.py index c44a2ee8f..b98df23b2 100644 --- a/pythainlp/tag/locations.py +++ b/pythainlp/tag/locations.py @@ -23,12 +23,12 @@ def tag_provinces(tokens: List[str]) -> List[Tuple[str, str]]: """ - This function recognize Thailand provinces in text. + This function recognizes Thailand provinces in text. Note that it uses exact match and considers no context. :param list[str] tokens: a list of words - :reutrn: a list of tuple indicating NER for `LOCATION` in IOB format + :return: a list of tuples indicating NER for `LOCATION` in IOB format :rtype: list[tuple[str, str]] :Example: diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index da9fab0f1..7dde484db 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -15,15 +15,14 @@ """ Named-entity recognizer """ -import warnings from typing import List, Tuple, Union class NER: """ - Named-entity recognizer class + Class of named-entity recognizer - :param str engine: Named-entity recognizer engine + :param str engine: engine of named-entity recognizer :param str corpus: corpus **Options for engine** @@ -34,7 +33,7 @@ class NER: **Options for corpus** * *thainer* - Thai NER corpus (default) - **Note**: for tltk engine, It's support ner model from tltk only. + **Note**: The tltk engine supports NER models from tltk only. """ def __init__(self, engine: str = "thainer-v2", corpus: str = "thainer") -> None: @@ -69,18 +68,18 @@ def tag( self, text, pos=False, tag=False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ - This function tags named-entitiy from text in IOB format. + This function tags named entities in text in IOB format. :param str text: text in Thai to be tagged - :param bool pos: output with part-of-speech tag.\ - (wangchanberta is not support) - :param bool tag: output like html tag. - :return: a list of tuple associated with tokenized word, NER tag, - POS tag (if the parameter `pos` is specified as `True`), - and output like html tag (if the parameter `tag` is + :param bool pos: output with part-of-speech tags.\ + (wangchanberta is not supported) + :param bool tag: output HTML-like tags. + :return: a list of tuples associated with tokenized words, NER tags, + POS tags (if the parameter `pos` is specified as `True`), + and output HTML-like tags (if the parameter `tag` is specified as `True`). 
- Otherwise, return a list of tuple associated with tokenized - word and NER tag + Otherwise, return a list of tuples associated with tokenized + words and NER tags :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str] :Example: @@ -105,7 +104,7 @@ class NNER: """ Nested Named Entity Recognition - :param str engine: Nested Named entity recognizer engine + :param str engine: engine of nested named entity recognizer :param str corpus: corpus **Options for engine** @@ -122,11 +121,11 @@ def load_engine(self, engine: str = "thai_nner") -> None: def tag(self, text) -> Tuple[List[str], List[dict]]: """ - This function tags nested named-entitiy. + This function tags nested named entities. :param str text: text in Thai to be tagged - :return: a list of tuple associated with tokenized word, NNER tag. + :return: a list of tuples associated with tokenized words and NNER tags. :rtype: Tuple[List[str], List[dict]] :Example: diff --git a/pythainlp/tag/orchid.py b/pythainlp/tag/orchid.py index ac8532198..59fe1c50c 100644 --- a/pythainlp/tag/orchid.py +++ b/pythainlp/tag/orchid.py @@ -130,7 +130,7 @@ def ud_exception(w: str, tag: str) -> str: - if w == "การ" or w == "ความ": + if w in ("การ", "ความ"): return "NOUN" return tag diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index accb5de67..11460513c 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -49,7 +49,6 @@ def _pud_tagger(): def _blackboard_tagger(): - global _BLACKBOARD_TAGGER if not _BLACKBOARD_TAGGER: path = get_corpus_path(_BLACKBOARD_NAME) _LST20_TAGGER = PerceptronTagger(path=path) @@ -71,15 +70,15 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: to_ud = True word_tags = [] - if corpus == "orchid" or corpus == "orchid_ud": + if corpus in ("orchid", "orchid_ud"): words = orchid.pre_process(words) word_tags = _orchid_tagger().tag(words) word_tags = orchid.post_process(word_tags, to_ud) - elif corpus == "blackboard" or corpus == "blackboard_ud": + elif corpus in ("blackboard", "blackboard_ud"): words = blackboard.pre_process(words) word_tags = _blackboard_tagger().tag(words) word_tags = blackboard.post_process(word_tags, to_ud) - else: # default, use "pud" as a corpus + else: # by default, use "pud" for corpus tagger = _pud_tagger() word_tags = tagger.tag(words) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index 90a0d7788..c2e9bbf7f 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -26,9 +26,9 @@ def pos_tag( * *perceptron* - perceptron tagger (default) * *unigram* - unigram tagger * *wangchanberta* - wangchanberta model. - * *tltk* - TLTK: Thai Language Toolkit (support TNC corpus only.\ - if you choose other corpus, It's change to TNC corpus.) - :param str corpus: the corpus that used to create the language model for tagger + * *tltk* - TLTK: Thai Language Toolkit (support TNC corpora only.\ + If you choose other corpora, they will be converted to TNC corpora.) + :param str corpus: the corpus that is used to create the language model for tagger * *orchid* - `ORCHID \ `_ corpus, \ text from Thai academic articles (default) @@ -142,8 +142,8 @@ def pos_tag_sents( * *perceptron* - perceptron tagger (default) * *unigram* - unigram tagger * *tltk* - TLTK: Thai Language Toolkit (support TNC corpus only.\ - if you choose other corpus, It's change to TNC corpus.) - :param str corpus: the corpus that used to create the language model for tagger + If you choose other corpora, they will be converted to TNC corpora.) 
+ :param str corpus: the corpus that is used to create the language model for tagger
 * *orchid* - `ORCHID \
 `_ corpus, \
 text from Thai academic articles (default)
diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py
index 62e2453ee..89de031bf 100644
--- a/pythainlp/tag/thainer.py
+++ b/pythainlp/tag/thainer.py
@@ -25,7 +25,7 @@
 from pythainlp.tokenize import word_tokenize
 from pythainlp.util import isthai

-_TOKENIZER_ENGINE = "newmm" # should be the same as one used in training data
+_TOKENIZER_ENGINE = "newmm" # should be the same as that used in training data


 def _is_stopword(word: str) -> bool: # เช็คว่าเป็นคำฟุ่มเฟือย
@@ -86,10 +86,10 @@ def _doc2features(doc, i) -> Dict:
 class ThaiNameTagger:
 """
 Thai named-entity recognizer or Thai NER.
- This function support Thai NER 1.4 and 1.5 only.
+ This function supports Thai NER 1.4 and 1.5 only.
 :param str version: Thai NER version.
- It's support Thai NER 1.4 & 1.5.
- The defualt value is `1.4
+ It supports Thai NER 1.4 & 1.5.
+ The default value is `1.4`
 :Example:
 ::
@@ -106,7 +106,7 @@ def __init__(self, version: str = "1.4") -> None:

 :param str version: Thai NER version.
 It's support Thai NER 1.4 & 1.5.
- The defualt value is `1.4`
+ The default value is `1.4`
 """
 from pycrfsuite import Tagger as CRFTagger
@@ -123,24 +123,24 @@ def get_ner(
 self, text: str, pos: bool = True, tag: bool = False
 ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
 """
- This function tags named-entitiy from text in IOB format.
+ This function tags named entities in text in IOB format.
 :param str text: text in Thai to be tagged
 :param bool pos: To include POS tags in the results (`True`) or
- exclude (`False`). The defualt value is `True`
- :param bool tag: output like html tag.
- :return: a list of tuple associated with tokenized word, NER tag,
- POS tag (if the parameter `pos` is specified as `True`),
- and output like html tag (if the parameter `tag` is
+ exclude (`False`). The default value is `True`
+ :param bool tag: output HTML-like tags.
+ :return: a list of tuples associated with tokenized words, NER tags,
+ POS tags (if the parameter `pos` is specified as `True`),
+ and output HTML-like tags (if the parameter `tag` is
 specified as `True`).
- Otherwise, return a list of tuple associated with tokenized
- word and NER tag
+ Otherwise, return a list of tuples associated with tokenized
+ words and NER tags
 :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str
 :Note:
 * For the POS tags to be included in the results, this function
- uses :func:`pythainlp.tag.pos_tag` with engine as `perceptron`
- and corpus as orchid_ud`.
+ uses :func:`pythainlp.tag.pos_tag` with engine `perceptron`
+ and corpus `orchid_ud`.
 :Example:
diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py
index b729629f8..16b2de347 100644
--- a/pythainlp/tag/tltk.py
+++ b/pythainlp/tag/tltk.py
@@ -39,18 +39,18 @@ def get_ner(
 """
 Named-entity recognizer from **TLTK**

- This function tags named-entitiy from text in IOB format.
+ This function tags named entities in text in IOB format.

 :param str text: text in Thai to be tagged
 :param bool pos: To include POS tags in the results (`True`) or
- exclude (`False`). The defualt value is `True`
- :param bool tag: output like html tag.
+ exclude (`False`). The default value is `True`
+ :param bool tag: output HTML-like tags.
+ :return: a list of tuples associated with tokenized words, NER tags, + POS tags (if the parameter `pos` is specified as `True`), + and output HTML-like tags (if the parameter `tag` is specified as `True`). - Otherwise, return a list of tuple associated with tokenized - word and NER tag + Otherwise, return a list of tuples associated with tokenized + words and NER tags :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str :Example: diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index b2300fb1f..ede8dab89 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -85,15 +85,15 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: to_ud = True word_tags = [] - if corpus == "orchid" or corpus == "orchid_ud": + if corpus in ("orchid", "orchid_ud"): words = orchid.pre_process(words) word_tags = _find_tag(words, _orchid_tagger()) word_tags = orchid.post_process(word_tags, to_ud) - elif corpus == "blackboard" or corpus == "blackboard_ud": + elif corpus in ("blackboard", "blackboard_ud"): words = blackboard.pre_process(words) word_tags = _find_tag(words, _blackboard_tagger()) word_tags = blackboard.post_process(word_tags, to_ud) - else: # default, use "pud" as a corpus + else: # by default, use "pud" for corpus word_tags = _find_tag(words, _pud_tagger()) return word_tags diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py index 348b48957..5ed4de517 100644 --- a/pythainlp/tokenize/__init__.py +++ b/pythainlp/tokenize/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Tokenizers at different level of linguistic analysis. +Tokenizers at different levels of linguistic analysis. """ __all__ = [ diff --git a/pythainlp/tokenize/_utils.py b/pythainlp/tokenize/_utils.py index 931f696e8..9d3395b48 100644 --- a/pythainlp/tokenize/_utils.py +++ b/pythainlp/tokenize/_utils.py @@ -26,7 +26,7 @@ def apply_postprocessors( segments: List[str], postprocessors: Callable[[List[str]], List[str]] ) -> List[str]: """ - A list of callables to apply on a raw segmentation result. + A list of callables to apply to a raw segmentation result. """ for func in postprocessors: segments = func(segments) @@ -38,7 +38,7 @@ def rejoin_formatted_num(segments: List[str]) -> List[str]: """ Rejoin well-known formatted numeric that are over-tokenized. The formatted numeric are numbers separated by ":", ",", or ".", - such as time, decimal number, comma-added number, and IP address. + such as time, decimal numbers, comma-added numbers, and IP addresses. 
:param List[str] segments: result from word tokenizer
 :return: a list of fixed tokens
diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py
index 5365a97d9..dd08f71f4 100644
--- a/pythainlp/tokenize/attacut.py
+++ b/pythainlp/tokenize/attacut.py
@@ -40,7 +40,7 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
 """
 Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
 :param str text: text to be tokenized to words
- :param str model: word tokenizer model to be tokenized to words
+ :param str model: word tokenizer model to be used
 :return: list of words, tokenized from the text
 :rtype: list[str]
 **Options for model**
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index ea33d00df..a19082981 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Tokenizer generic functions
+Generic functions of tokenizers
 """
 import re
 from typing import Iterable, List, Union
@@ -38,10 +38,10 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
 """
 Clause tokenizer. (or Clause segmentation)
 Tokenizes running word list into list of clauses (list of strings).
- split by CRF trained on Blackboard Treebank.
+ Split by CRF trained on Blackboard Treebank.

- :param str doc: word list to be clause
- :return: list of claues
+ :param str doc: word list to be clause tokenized
+ :return: list of clauses
 :rtype: list[list[str]]
 :Example:
 ::
@@ -63,11 +63,11 @@ def word_detokenize(
 """
 Word detokenizer.

- This function will detokenize the list word in each sentence to text.
+ This function will detokenize the list of words in each sentence into text.

- :param str segments: List sentences with list words.
+ :param str segments: List of sentences, each with a list of words.
 :param str output: the output type (str or list)
- :return: the thai text
+ :return: the Thai text
 :rtype: Union[str,List[str]]
 :Example:
 ::
@@ -90,7 +90,7 @@ def word_detokenize(
 if j > 0:
 # previous word
 p_w = s[j - 1]
- # if w is number or other language and not be space
+ # if w is a number or in another language, and is not a space
 if (
 w[0] not in thai_characters
 and not w.isspace()
@@ -98,7 +98,7 @@ def word_detokenize(
 ):
 _list_sents.append(" ")
 _add_index.append(j)
- # if previous word is number or other language and not be space
+ # if previous word is a number or in another language, and is not a space
 elif p_w[0] not in thai_characters and not p_w.isspace():
 _list_sents.append(" ")
 _add_index.append(j)
@@ -118,10 +118,7 @@ def word_detokenize(
 else:
 _text = []
 for i in _list_all:
- _temp = ""
- for j in i:
- _temp += j
- _text.append(_temp)
+ _text.append("".join(i))
 return " ".join(_text)


@@ -140,9 +137,9 @@ def word_tokenize(
 :param str text: text to be tokenized
 :param str engine: name of the tokenizer to be used
 :param pythainlp.util.Trie custom_dict: dictionary trie
- :param bool keep_whitespace: True to keep whitespaces, a common mark
+ :param bool keep_whitespace: True to keep whitespace, a common mark
 for end of phrase in Thai.
- Otherwise, whitespaces are omitted.
+ Otherwise, whitespace is omitted.
 :param bool join_broken_num: True to rejoin formatted numeric that
 could be wrongly separated.
 Otherwise, formatted numeric could be wrongly separated.
@@ -162,17 +159,17 @@ def word_tokenize( * *longest* - dictionary-based, longest matching * *mm* - "multi-cut", dictionary-based, maximum matching * *nercut* - dictionary-based, maximal matching, - constrained with Thai Character Cluster (TCC) boundaries, + constrained by Thai Character Cluster (TCC) boundaries, combining tokens that are parts of the same named-entity * *newmm* (default) - "new multi-cut", dictionary-based, maximum matching, - constrained with Thai Character Cluster (TCC) boundaries - with improve the TCC rule that used in newmm. + constrained by Thai Character Cluster (TCC) boundaries + with improved TCC rules that are used in newmm. * *newmm-safe* - newmm, with a mechanism to avoid long - processing time for text with continuous ambiguous breaking points + processing time for text with continuously ambiguous breaking points * *nlpo3* - wrapper for a word tokenizer in `nlpO3 `_., - newmm adaptation in Rust (2.5x faster) + adaptation of newmm in Rust (2.5x faster) * *oskut* - wrapper for `OSKut `_., Out-of-domain StacKed cut for Word Segmentation @@ -187,7 +184,7 @@ def word_tokenize( *deepcut*, *longest*, *newmm*, and *newmm-safe* engines. :Example: - Tokenize text with different tokenizer:: + Tokenize text with different tokenizers:: from pythainlp.tokenize import word_tokenize @@ -199,7 +196,7 @@ def word_tokenize( word_tokenize(text, engine='attacut') # output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด'] - Tokenize text by omiting whitespaces:: + Tokenize text with whitespace omitted:: text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว " @@ -210,7 +207,7 @@ def word_tokenize( word_tokenize(text, engine="newmm", keep_whitespace=False) # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] - Join broken formatted numeric (e.g. time, decimals, IP address):: + Join broken formatted numeric (e.g. time, decimals, IP addresses):: text = "เงิน1,234บาท19:32น 127.0.0.1" @@ -223,7 +220,7 @@ def word_tokenize( # output: # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1'] - Tokenize with default and custom dictionary:: + Tokenize with default and custom dictionaries:: from pythainlp.corpus.common import thai_words from pythainlp.tokenize import dict_trie @@ -251,7 +248,7 @@ def word_tokenize( segments = [] - if engine == "newmm" or engine == "onecut": + if engine in ("newmm", "onecut"): from pythainlp.tokenize.newmm import segment segments = segment(text, custom_dict) @@ -267,7 +264,7 @@ def word_tokenize( from pythainlp.tokenize.longest import segment segments = segment(text, custom_dict) - elif engine == "mm" or engine == "multi_cut": + elif engine in ("mm", "multi_cut"): from pythainlp.tokenize.multi_cut import segment segments = segment(text, custom_dict) @@ -344,21 +341,21 @@ def sent_tokenize( :param str text: the text to be tokenized :param str engine: choose among *'crfcut'*, *'whitespace'*, \ *'whitespace+newline'* - :return: list of splited sentences + :return: list of split sentences :rtype: list[str] **Options for engine** * *crfcut* - (default) split by CRF trained on TED dataset - * *thaisum* - The implementation of sentence segmentator from \ + * *thaisum* - The implementation of sentence segmenter from \ Nakhun Chumpolsathien, 2020 * *tltk* - split by `TLTK `_., * *wtp* - split by `wtpsplitaxe `_., \ - It support many size of models. You can use ``wtp`` to use mini model, \ + It supports many sizes of models. 
You can use ``wtp`` to use mini model, \
 ``wtp-tiny`` to use ``wtp-bert-tiny`` model (default), \
 ``wtp-mini`` to use ``wtp-bert-mini`` model, \
 ``wtp-base`` to use ``wtp-canine-s-1l`` model, \
 and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
- * *whitespace+newline* - split by whitespaces and newline.
- * *whitespace* - split by whitespaces. Specifiaclly, with \
+ * *whitespace+newline* - split by whitespace and newline.
+ * *whitespace* - split by whitespace, specifically with \
 :class:`regex` pattern ``r" +"``

 :Example:
@@ -447,23 +444,23 @@ def sent_tokenize(


 def paragraph_tokenize(
- text: str,
- engine: str = "wtp-mini",
- paragraph_threshold:float=0.5,
- style:str='newline',
- ) -> List[List[str]]:
+ text: str,
+ engine: str = "wtp-mini",
+ paragraph_threshold: float = 0.5,
+ style: str = 'newline',
+) -> List[List[str]]:
 """
 Paragraph tokenizer.
- Tokenizes text into paragraph.
+ Tokenizes text into paragraphs.

 :param str text: text to be tokenized
- :param str engine: the name paragraph tokenizer
- :return: list of paragraph
+ :param str engine: the name of the paragraph tokenizer
+ :return: list of paragraphs
 :rtype: List[List[str]]
 **Options for engine**
 * *wtp* - split by `wtpsplitaxe `_., \
- It support many size of models. You can use ``wtp`` to use mini model, \
+ It supports many sizes of models. You can use ``wtp`` to use mini model, \
 ``wtp-tiny`` to use ``wtp-bert-tiny`` model (default), \
 ``wtp-mini`` to use ``wtp-bert-mini`` model, \
 ``wtp-base`` to use ``wtp-canine-s-1l`` model, \
 and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
 :Example:
 ::
@@ -504,7 +501,7 @@ def paragraph_tokenize(
 paragraph_threshold=paragraph_threshold,
 style=style,
 )
- 
+
 else:
 raise ValueError(
 f"""Tokenizer \"{engine}\" not found.
@@ -519,23 +516,23 @@ def subword_tokenize(
 keep_whitespace: bool = True,
 ) -> List[str]:
 """
- Subword tokenizer. Can be smaller than syllable.
+ Subword tokenizer for tokenizing text into units smaller than syllables.

 Tokenizes text into inseparable units of
- Thai contiguous characters namely
+ Thai contiguous characters, namely
 `Thai Character Clusters (TCCs) \
 `_
- TCCs are the units based on Thai spelling feature that could not be
- separated any character further such as 'ก็', 'จะ', 'ไม่', and 'ฝา'.
+ TCCs are units based on Thai spelling features that cannot be
+ separated into any smaller units, such as 'ก็', 'จะ', 'ไม่', and 'ฝา'.
 If the following units are separated, they could not be spelled out.
- This function apply the TCC rules to tokenizes the text into
+ This function applies TCC rules to tokenize the text into
 the smallest units.

 For example, the word 'ขนมชั้น' would be tokenized
 into 'ข', 'น', 'ม', and 'ชั้น'.

 :param str text: text to be tokenized
- :param str engine: the name subword tokenizer
+ :param str engine: the name of the subword tokenizer
 :param bool keep_whitespace: keep whitespace
 :return: list of subwords
 :rtype: List[str]

 **Options for engine**
 * *dict* - newmm word tokenizer with a syllable dictionary
 * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
 * *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
 `_.
 * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
- * *tcc_p* - Thai Character Cluster + improve the rule that used in newmm
+ * *tcc_p* - Thai Character Cluster + improved rules that are used in newmm
 * *tltk* - syllable tokenizer from tltk. See `tltk \
 `_.
* *wangchanberta* - SentencePiece from wangchanberta model

 :Example:
- Tokenize text into subword based on *tcc*::
+ Tokenize text into subwords based on *tcc*::

 from pythainlp.tokenize import subword_tokenize

@@ -570,7 +567,7 @@ def subword_tokenize(
 # output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
 'และ', 'พัฒ','นา', 'กา', 'ร']

- Tokenize text into subword based on *etcc*::
+ Tokenize text into subwords based on *etcc*::

 text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
 text_2 = "ความแปลกแยกและพัฒนาการ"

 subword_tokenize(text_1, engine='etcc')
 # output: ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก',
 'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง', 'ศ', '์', 'ห', 'มิ', 'ง']

 subword_tokenize(text_2, engine='etcc')
 # output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']

- Tokenize text into subword based on *wangchanberta*::
+ Tokenize text into subwords based on *wangchanberta*::

 text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
 text_2 = "ความแปลกแยกและพัฒนาการ"

@@ -625,7 +622,7 @@ def subword_tokenize(
 It might be a typo; if not, please consult our document."""
 )

- if segments == []:
+ if not segments:
 segments = segment(text)

 if not keep_whitespace:
@@ -643,10 +640,10 @@ def syllable_tokenize(
 Syllable tokenizer

 Tokenizes text into inseparable units of
- Thai syllable.
+ Thai syllables.

 :param str text: text to be tokenized
- :param str engine: the name syllable tokenizer
+ :param str engine: the name of the syllable tokenizer
 :param bool keep_whitespace: keep whitespace
 :return: list of subwords
 :rtype: List[str]
@@ -674,11 +671,11 @@ def syllable_tokenize(

 class Tokenizer:
 """
- Tokenizer class, for a custom tokenizer.
+ Tokenizer class for a custom tokenizer.
 This class allows users to pre-define custom dictionary along
 with tokenizer and encapsulate them into one single object.
- It is an wrapper for both two functions including
+ It is a wrapper for both functions, namely
 :func:`pythainlp.tokenize.word_tokenize`,
 and :func:`pythainlp.util.dict_trie`

@@ -710,8 +707,8 @@ class Tokenizer:
 # ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
 # 'ผิดปกติ', 'ของ', 'การ', 'พูด']

- Tokenizer object instantiated with a file path containing list of
- word separated with *newline* and explicitly set a new tokenizer
+ Tokenizer object instantiated with a file path containing a list of
+ words separated with *newline* and explicitly setting a new tokenizer
 after initiation::

 PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txtt'

 # write a file
 with open(PATH_TO_CUSTOM_DICTIONARY, 'w', encoding='utf-8') as f:
 f.write('ยุคล\nมจุรี\nมจุรีพิสัย\n')

 text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"

- # initate an object from file with `attacut` as tokenizer
+ # initiate an object from file with `attacut` as tokenizer
 _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\
 engine='attacut')

@@ -752,9 +749,9 @@ def __init__(
 :param str custom_dict: a file path, a list of vocaburaies*
 to be used to create a trie, or an instantiated
 :class:`pythainlp.util.Trie` object.
- :param str engine: choose between different options of engine to token
+ :param str engine: choose between different options of tokenizer engines
 (i.e. *newmm*, *mm*, *longest*, *deepcut*)
- :param bool keep_whitespace: True to keep whitespaces, a common mark
+ :param bool keep_whitespace: True to keep whitespace, a common mark
 for end of phrase in Thai
 """
 self.__trie_dict = None
@@ -793,7 +790,7 @@ def set_tokenize_engine(self, engine: str) -> None:
 """
 Set the tokenizer's engine.

- :param str engine: choose between different options of engine to token
+ :param str engine: choose between different options of tokenizer engines
 (i.e. 
*newmm*, *mm*, *longest*, *deepcut*) """ self.__engine = engine diff --git a/pythainlp/tokenize/crfcut.py b/pythainlp/tokenize/crfcut.py index 7a0ded7ed..4c8b67736 100644 --- a/pythainlp/tokenize/crfcut.py +++ b/pythainlp/tokenize/crfcut.py @@ -16,7 +16,7 @@ CRFCut - Thai sentence segmenter. Thai sentence segmentation using conditional random field, -default model trained on TED dataset +with default model trained on TED dataset Performance: - ORCHID - space-correct accuracy 87% vs 95% state-of-the-art @@ -143,7 +143,7 @@ def extract_features( Extract features for CRF by sliding `max_n_gram` of tokens for +/- `window` from the current token - :param List[str] doc: tokens from which features are to be extracted from + :param List[str] doc: tokens from which features are to be extracted :param int window: size of window before and after the current token :param int max_n_gram: create n_grams from 1-gram to `max_n_gram`-gram \ within the `window` @@ -199,7 +199,7 @@ def segment(text: str) -> List[str]: """ CRF-based sentence segmentation. - :param str text: text to be tokenized to sentences + :param str text: text to be tokenized into sentences :return: list of words, tokenized from the text """ if isinstance(text, str): diff --git a/pythainlp/tokenize/etcc.py b/pythainlp/tokenize/etcc.py index f5dc495a7..eb3740d32 100644 --- a/pythainlp/tokenize/etcc.py +++ b/pythainlp/tokenize/etcc.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Segmenting text to Enhanced Thai Character Cluster (ETCC) +Segmenting text into Enhanced Thai Character Clusters (ETCCs) Python implementation by Wannaphong Phatthiyaphaibun This implementation relies on a dictionary of ETCC created from etcc.txt @@ -66,7 +66,7 @@ def segment(text: str) -> List[str]: longest matching techniques." In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001. - :param str text: text to be tokenized to character clusters + :param str text: text to be tokenized into character clusters :return: list of clusters, tokenized from the text :return: List[str] """ diff --git a/pythainlp/tokenize/han_solo.py b/pythainlp/tokenize/han_solo.py index c17da83a1..3f180fc54 100644 --- a/pythainlp/tokenize/han_solo.py +++ b/pythainlp/tokenize/han_solo.py @@ -43,16 +43,15 @@ class Featurizer: # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# {
+# {
 # "0 (current anchor)|+1 (the character on the right from anchor)|A (character)" : 1
 # }
-    
+
    def __init__(self, N=2, sequence_size=1, delimiter=None):
        self.N = N
        self.delimiter = delimiter
        self.radius = N + sequence_size
-        pass
-    
+
    def pad(self, sentence, padder='#'):
        return padder * (self.radius) + sentence + padder * (self.radius)

@@ -96,7 +95,7 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
                    features[left_key] = 1
                else:
                    features.append(left_key)
-            
+
            abs_index_right += 1 # สมมุติคือตำแหน่งที่ 0 จะได้ 0, 1, 2, 3, 4 (radius = 5)
            char_right = sentence[abs_index_right]
            while char_right == self.delimiter:
@@ -110,9 +109,9 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
                    features[right_key] = 1
                else:
                    features.append(right_key)
-            
+
            counter += 1
-            
+
        chars = chars_left + chars_right
        for i in range(0, len(chars) - self.N + 1):
            ngram = chars[i:i + self.N]
@@ -122,10 +121,10 @@ def featurize(self, sentence, padding=True, indiv_char=True, return_type='list')
            else:
                features.append(ngram_key)
        all_features.append(features)
-        if(return_type == 'list'):
-            cut = str(cut)  
+        if return_type == 'list':
+            cut = str(cut)
        all_labels.append(cut)
-        
+
        return {
            'X': all_features,
            'Y': all_labels
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index c652902c5..722c47d99 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Dictionary-based longest-matching Thai word segmentation. Implementation based
-on the code from Patorn Utenpattanun.
+Dictionary-based longest-matching Thai word segmentation. Implementation is based
+on the code from Patorn Utenpattanun.

:See Also:
    * `GitHub Repository \
@@ -40,7 +40,7 @@
 _UNKNOWN = False


-class LongestMatchTokenizer(object):
+class LongestMatchTokenizer():
    def __init__(self, trie: Trie):
        self.__trie = trie

@@ -144,7 +144,7 @@ def segment(
    """
    Dictionary-based longest matching word segmentation.

-    :param str text: text to be tokenized to words
+    :param str text: text to be tokenized into words
    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
    :return: list of words, tokenized from the text
    """
diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 45c077ce0..896ca8b0a 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """
 Multi cut -- Thai word segmentation with maximum matching.
-Original code from Korakot Chaovavanich.
+Original code is from Korakot Chaovavanich.

:See Also:
    * `Facebook post \
@@ -50,9 +50,9 @@ def __init__(self, value, multi=None, in_dict=True):

 _RE_NONTHAI = r"""(?x)
 [-a-zA-Z]+|       # Latin characters
-\d+([,\.]\d+)*|   # number
-[ \t]+|           # space
-\r?\n             # newline
+\d+([,\.]\d+)*|   # numbers
+[ \t]+|           # spaces
+\r?\n             # newlines
 """
 _PAT_NONTHAI = re.compile(_RE_NONTHAI)

@@ -94,7 +94,7 @@ def serialize(p, p2):  # helper function
            last_p = q0
        elif len_q == 0:  # len(q) == 0  means not found in dictionary
            m = _PAT_NONTHAI.match(text[p:])
-            if m:  # non-Thai toekn
+            if m:  # non-Thai token
                i = p + m.span()[1]
            else:  # non-Thai token, find minimum skip
                for i in range(p, len_text):
diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py
index 70baaccaa..99f7e381b 100644
--- a/pythainlp/tokenize/nercut.py
+++ b/pythainlp/tokenize/nercut.py
@@ -15,9 +15,9 @@
 """
 nercut 0.2

-Dictionary-based maximal matching word segmentation, constrained with
+Dictionary-based maximal matching word segmentation, constrained by
 Thai Character Cluster (TCC) boundaries, and combining tokens that are
-parts of the same named-entity.
+parts of the same named entity.

 Code by Wannaphong Phatthiyaphaibun
 """
@@ -41,13 +41,13 @@ def segment(
    tagger=_thainer,
 ) -> List[str]:
    """
-    Dictionary-based maximal matching word segmentation, constrained with
+    Dictionary-based maximal matching word segmentation, constrained by
    Thai Character Cluster (TCC) boundaries, and combining tokens that are
    parts of the same named-entity.

-    :param str text: text to be tokenized to words
-    :parm list taglist: a list of named-entity tags to be used
-    :parm class tagger: ner tagger engine
+    :param str text: text to be tokenized into words
+    :param list taglist: a list of named entity tags to be used
+    :param class tagger: NER tagger engine
    :return: list of words, tokenized from the text
    """
    if not isinstance(text, str):
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 0d9627138..708c5efdd 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Dictionary-based maximal matching word segmentation, constrained with
-Thai Character Cluster (TCC) boundaries with improve the rules.
+Dictionary-based maximal matching word segmentation, constrained by
+Thai Character Cluster (TCC) boundaries with improved rules.

-The code is based on the notebooks created by Korakot Chaovavanich,
-with heuristic graph size limit added to avoid exponential wait time.
+The code is based on the notebooks created by Korakot Chaovavanich,
+with heuristic graph size limit added to avoid exponential waiting time.

:See Also: * \ @@ -39,9 +39,9 @@ _PAT_NONTHAI = re.compile( r"""(?x) [-a-zA-Z]+| # Latin characters -\d+([,\.]\d+)*| # number -[ \t]+| # space -\r?\n # newline +\d+([,\.]\d+)*| # numbers +[ \t]+| # spaces +\r?\n # newlines """ ) @@ -78,12 +78,12 @@ def _bfs_paths_graph( def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]: # main data structure: - # - key is begin position (int) - # - value is possible end positions (List[int]) + # - key is beginning position (int) + # - value is possible ending positions (List[int]) # if key is not found, value is empty list graph = defaultdict(list) - graph_size = 0 # keep track of graph size, if too big will force cutoff + graph_size = 0 # keep track of graph size, if too big, force cutoff valid_poss = tcc_pos(text) # breaking positions that are TCC-valid @@ -151,10 +151,10 @@ def segment( custom_dict: Trie = DEFAULT_WORD_DICT_TRIE, safe_mode: bool = False, ) -> List[str]: - """Maximal-matching word segmentation, Thai Character Cluster constrained. + """Maximal-matching word segmentation constrained by Thai Character Cluster. A dictionary-based word segmentation using maximal matching algorithm, - constrained to Thai Character Cluster boundaries. + constrained by Thai Character Cluster boundaries. A custom dictionary can be supplied. @@ -163,7 +163,7 @@ def segment( :param custom_dict: tokenization dictionary,\ defaults to DEFAULT_WORD_DICT_TRIE :type custom_dict: Trie, optional - :param safe_mode: reduce chance for long processing time in long text\ + :param safe_mode: reduce chance for long processing time for long text\ with many ambiguous breaking points, defaults to False :type safe_mode: bool, optional :return: list of tokens @@ -179,12 +179,12 @@ def segment( return list(_onecut(text, custom_dict)) # if the text is longer than the limit, - # breaks them into smaller chunks then tokenizes each chunk + # break them into smaller chunks, then tokenize each chunk text_parts = [] while len(text) >= _TEXT_SCAN_END: sample = text[_TEXT_SCAN_BEGIN:_TEXT_SCAN_END] - # find possible break positions + # find possible breaking positions cut_pos = _TEXT_SCAN_END # try to break by space first @@ -212,7 +212,7 @@ def segment( if len(text): text_parts.append(text) - # tokenizes each text parts + # tokenizes each text part tokens = [] for text_part in text_parts: tokens.extend(list(_onecut(text_part, custom_dict))) diff --git a/pythainlp/tokenize/nlpo3.py b/pythainlp/tokenize/nlpo3.py index 314a59dfa..fceeab73e 100644 --- a/pythainlp/tokenize/nlpo3.py +++ b/pythainlp/tokenize/nlpo3.py @@ -29,12 +29,12 @@ def load_dict(file_path: str, dict_name: str) -> bool: """Load a dictionary file into an in-memory dictionary collection. - The loaded dictionary will be accessible throught the assigned dict_name. + The loaded dictionary will be accessible through the assigned dict_name. *** This function does not override an existing dict name. *** :param file_path: Path to a dictionary file :type file_path: str - :param dict_name: A unique dictionary name, use for reference. + :param dict_name: A unique dictionary name, used for reference. 
:type dict_name: str :return bool @@ -61,7 +61,7 @@ def segment( :param str text: text to be tokenized :param str custom_dict: dictionary name, as assigned with load_dict(),\ defaults to pythainlp/corpus/common/words_th.txt - :param bool safe_mode: reduce chance for long processing time in long text\ + :param bool safe_mode: reduce chance for long processing time for long text\ with many ambiguous breaking points, defaults to False :param bool parallel_mode: Use multithread mode, defaults to False diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py index 6c5730288..d6dff991a 100644 --- a/pythainlp/tokenize/pyicu.py +++ b/pythainlp/tokenize/pyicu.py @@ -15,7 +15,7 @@ """ Wrapper for PyICU word segmentation. This wrapper module uses :class:`icu.BreakIterator` with Thai as :class:`icu.Local` -to locate boundaries between words from the text. +to locate boundaries between words in the text. :See Also: * `GitHub repository `_ @@ -37,7 +37,7 @@ def _gen_words(text: str) -> str: def segment(text: str) -> List[str]: """ - :param str text: text to be tokenized to words + :param str text: text to be tokenized into words :return: list of words, tokenized from the text """ if not text or not isinstance(text, str): diff --git a/pythainlp/tokenize/tcc.py b/pythainlp/tokenize/tcc.py index 2ea01b8c0..dbac2fd6a 100644 --- a/pythainlp/tokenize/tcc.py +++ b/pythainlp/tokenize/tcc.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -The implementation of tokenizer accorinding to Thai Character Clusters (TCCs) -rules purposed by `Theeramunkong et al. 2000. \ +The implementation of tokenizer according to Thai Character Clusters (TCCs) +rules proposed by `Theeramunkong et al. 2000. \ `_ Credits: @@ -72,9 +72,9 @@ def tcc(text: str) -> str: """ - TCC generator, generates Thai Character Clusters + TCC generator which generates Thai Character Clusters - :param str text: text to be tokenized to character clusters + :param str text: text to be tokenized into character clusters :return: subwords (character clusters) :rtype: Iterator[str] """ @@ -97,8 +97,8 @@ def tcc_pos(text: str) -> Set[int]: """ TCC positions - :param str text: text to be tokenized to character clusters - :return: list of the end position of subwords + :param str text: text to be tokenized into character clusters + :return: list of the ending position of subwords :rtype: set[int] """ if not text or not isinstance(text, str): @@ -117,7 +117,7 @@ def segment(text: str) -> List[str]: """ Subword segmentation - :param str text: text to be tokenized to character clusters + :param str text: text to be tokenized into character clusters :return: list of subwords (character clusters), tokenized from the text :rtype: list[str] diff --git a/pythainlp/tokenize/tcc_p.py b/pythainlp/tokenize/tcc_p.py index c9b1bc329..9dade4d5b 100644 --- a/pythainlp/tokenize/tcc_p.py +++ b/pythainlp/tokenize/tcc_p.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -The implementation of tokenizer accorinding to Thai Character Clusters (TCCs) -rules purposed by `Theeramunkong et al. 2000. \ +The implementation of tokenizer according to Thai Character Clusters (TCCs) +rules proposed by `Theeramunkong et al. 2000. 
\
`_
-and improve the rule that used in newmm
+and improved rules that are used in newmm

 Credits:
 * TCC: Jakkrit TeCho
@@ -72,9 +72,9 @@

 def tcc(text: str) -> str:
    """
-    TCC generator, generates Thai Character Clusters
+    TCC generator which generates Thai Character Clusters

-    :param str text: text to be tokenized to character clusters
+    :param str text: text to be tokenized into character clusters
    :return: subwords (character clusters)
    :rtype: Iterator[str]
    """
@@ -97,8 +97,8 @@ def tcc_pos(text: str) -> Set[int]:
    """
    TCC positions

-    :param str text: text to be tokenized to character clusters
-    :return: list of the end position of subwords
+    :param str text: text to be tokenized into character clusters
+    :return: list of the ending position of subwords
    :rtype: set[int]
    """
    if not text or not isinstance(text, str):
@@ -117,7 +117,7 @@ def segment(text: str) -> List[str]:
    """
    Subword segmentation

-    :param str text: text to be tokenized to character clusters
+    :param str text: text to be tokenized into character clusters
    :return: list of subwords (character clusters), tokenized from
             the text
    :rtype: list[str]
diff --git a/pythainlp/tokenize/thaisumcut.py b/pythainlp/tokenize/thaisumcut.py
index e1b39f42d..eb12144e7 100644
--- a/pythainlp/tokenize/thaisumcut.py
+++ b/pythainlp/tokenize/thaisumcut.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
 The implementation of sentence segmentator from Nakhun Chumpolsathien, 2020
-original code from: https://github.com/nakhunchumpolsathien/ThaiSum
+original code is from: https://github.com/nakhunchumpolsathien/ThaiSum

 Cite:

diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
index 2bcbd4183..c0c29fd1b 100644
--- a/pythainlp/tokenize/wtsplit.py
+++ b/pythainlp/tokenize/wtsplit.py
@@ -40,30 +40,30 @@ def _tokenize(
        return _MODEL.split(text,lang_code=lang_code)
    else: # Paragraph
        if style=='newline':
-            return _MODEL.split(
-                    text,
-                    lang_code=lang_code,
-                    do_paragraph_segmentation=True,
-                    paragraph_threshold=paragraph_threshold
-            )
+            return _MODEL.split(
+                text,
+                lang_code=lang_code,
+                do_paragraph_segmentation=True,
+                paragraph_threshold=paragraph_threshold
+            )
        elif style=='opus100':
-            return _MODEL.split(
-                    text,
-                    lang_code=lang_code,
-                    do_paragraph_segmentation=True,
-                    threshold=paragraph_threshold,
-                    style=style,
-            )
+            return _MODEL.split(
+                text,
+                lang_code=lang_code,
+                do_paragraph_segmentation=True,
+                threshold=paragraph_threshold,
+                style=style,
+            )
        else:
-            raise ValueError(
-                f"""Segmentation style \"{style}\" not found.
+            raise ValueError(
+                f"""Segmentation style \"{style}\" not found.
+                It might be a typo; if not, please consult our document."""
            )

 def tokenize(
-    text:str,
-    size:str="mini",
-    tokenize:str="sentence",
+    text:str,
+    size:str="mini",
+    tokenize:str="sentence",
    paragraph_threshold:float=0.5,
    style:str='newline',
 )-> List[str]:
@@ -77,9 +77,9 @@ def tokenize(
    else: # mini
        _model_load="wtp-bert-mini"
    return _tokenize(
-            text,
-            model=_model_load,
-            tokenize=tokenize,
-            paragraph_threshold=paragraph_threshold,
-            style=style,
-    )
+        text,
+        model=_model_load,
+        tokenize=tokenize,
+        paragraph_threshold=paragraph_threshold,
+        style=style,
+    )
diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
index 359a181e6..b45db1a61 100644
--- a/pythainlp/tools/misspell.py
+++ b/pythainlp/tools/misspell.py
@@ -83,7 +83,7 @@ def find_misspell_candidates(char: str, verbose: bool = False):
    valid_neighbours = find_neighbour_locations(loc, char)

    chars = []
-    printing_locations = ["▐"] * 3 + [char] + ["​▐"] * 3
+    printing_locations = ["▐"] * 3 + [char] + ["▐"] * 3

    for language_ix, is_shift, row, pos, char in valid_neighbours:
        try:
@@ -104,7 +104,7 @@ def find_misspell_candidates(char: str, verbose: bool = False):
            else:
                continue
            printing_locations[ix] = char
-        except IndexError as e:
+        except IndexError:
            continue
        except Exception as e:
            print("Something wrong with: ", char)
@@ -115,13 +115,13 @@ def find_misspell_candidates(char: str, verbose: bool = False):

 def misspell(sentence: str, ratio: float = 0.05):
    """
-    Simulate some mispellings for the input sentence.
-    The number of mispelled locations is governed by ratio.
+    Simulate some misspellings of the input sentence.
+    The number of misspelled locations is governed by ratio.

-    :params str sentence: sentence to be mispelled
+    :param str sentence: sentence to be misspelled
    :params float ratio: number of misspells per 100 chars. Defaults to 0.5.

-    :return: sentence containing some misspelled
+    :return: sentence containing some misspelled words
    :rtype: str

    :Example:
diff --git a/pythainlp/tools/path.py b/pythainlp/tools/path.py
index 3ba2f21db..42ef002e0 100644
--- a/pythainlp/tools/path.py
+++ b/pythainlp/tools/path.py
@@ -72,9 +72,9 @@ def get_pythainlp_data_path() -> str:

 def get_pythainlp_path() -> str:
    """
-    This function returns full path of PyThaiNLP code
+    This function returns the full path of the PyThaiNLP code

-    :return: full path of :mod:`pythainlp` code
+    :return: the full path of the :mod:`pythainlp` code
    :rtype: str

    :Example:
diff --git a/pythainlp/translate/core.py b/pythainlp/translate/core.py
index edcf18069..7a433725c 100644
--- a/pythainlp/translate/core.py
+++ b/pythainlp/translate/core.py
@@ -25,11 +25,11 @@ def __init__(
        """
        :param str src_lang: source language
        :param str target_lang: target language
-        :param str engine: Machine Translation engine
-        :param bool use_gpu: load model to gpu (Default is False)
+        :param str engine: machine translation engine
+        :param bool use_gpu: load model using GPU (Default is False)

        **Options for engine*
-            * *default* - The engine default by each a language.
+            * *default* - The default engine for each language.
* *small100* - A multilingual machine translation model (covering 100 languages) **Options for source & target language** diff --git a/pythainlp/translate/en_th.py b/pythainlp/translate/en_th.py index 56cc10506..4266a77da 100644 --- a/pythainlp/translate/en_th.py +++ b/pythainlp/translate/en_th.py @@ -20,15 +20,12 @@ Website: https://airesearch.in.th/releases/machine-translation-models/ """ import os -import tarfile -from collections import defaultdict - -from pythainlp.corpus import download, get_corpus_path -from pythainlp.tools import get_full_data_path, get_pythainlp_data_path from fairseq.models.transformer import TransformerModel from sacremoses import MosesTokenizer +from pythainlp.corpus import download, get_corpus_path + _EN_TH_MODEL_NAME = "scb_1m_en-th_moses" # SCB_1M-MT_OPUS+TBASE_en-th_moses-spm_130000-16000_v1.0.tar.gz @@ -50,7 +47,7 @@ def _download_install(name: str) -> None: def download_model_all() -> None: """ - Download all translation models in advanced + Download all translation models in advance """ _download_install(_EN_TH_MODEL_NAME) _download_install(_TH_EN_MODEL_NAME) @@ -64,7 +61,7 @@ class EnThTranslator: Website: https://airesearch.in.th/releases/machine-translation-models/ - :param bool use_gpu : load model to gpu (Default is False) + :param bool use_gpu : load model using GPU (Default is False) """ def __init__(self, use_gpu: bool = False): @@ -122,7 +119,7 @@ class ThEnTranslator: Website: https://airesearch.in.th/releases/machine-translation-models/ - :param bool use_gpu : load model to gpu (Default is False) + :param bool use_gpu : load model using GPU (Default is False) """ def __init__(self, use_gpu: bool = False): diff --git a/pythainlp/translate/small100.py b/pythainlp/translate/small100.py index 9a4fb5bd2..05c9b1e36 100644 --- a/pythainlp/translate/small100.py +++ b/pythainlp/translate/small100.py @@ -3,11 +3,11 @@ class Small100Translator: """ - Machine Translation with small100 model + Machine Translation using small100 model - Huggingface https://huggingface.co/alirezamsh/small100 - :param bool use_gpu : load model to gpu (Default is False) + :param bool use_gpu : load model using GPU (Default is False) """ def __init__( diff --git a/pythainlp/translate/th_fr.py b/pythainlp/translate/th_fr.py index a57f74008..da7b67620 100644 --- a/pythainlp/translate/th_fr.py +++ b/pythainlp/translate/th_fr.py @@ -17,7 +17,7 @@ Trained by OPUS Corpus -Model from Language Technology Research Group at the University of Helsinki +Model is from Language Technology Research Group at the University of Helsinki BLEU 20.4 @@ -32,13 +32,13 @@ class ThFrTranslator: Trained by OPUS Corpus - Model from Language Technology Research Group at the University of Helsinki + Model is from Language Technology Research Group at the University of Helsinki BLEU 20.4 - Huggingface https://huggingface.co/Helsinki-NLP/opus-mt-th-fr - :param bool use_gpu : load model to gpu (Default is False) + :param bool use_gpu : load model using GPU (Default is False) """ def __init__( diff --git a/pythainlp/translate/tokenization_small100.py b/pythainlp/translate/tokenization_small100.py index 7168076a5..9bb21cfe7 100644 --- a/pythainlp/translate/tokenization_small100.py +++ b/pythainlp/translate/tokenization_small100.py @@ -1,6 +1,6 @@ # Copyright (c) 2022 Idiap Research Institute, http://www.idiap.ch/ # Written by Alireza Mohammadshahi -# This is a modified version of https://github.com/huggingface/transformers/blob/main/src/transformers/models/m2m_100/tokenization_m2m_100.py +# This is a modified 
version of https://github.com/huggingface/transformers/blob/main/src/transformers/models/m2m_100/tokenization_m2m_100.py # which owns by Fariseq Authors and The HuggingFace Inc. team. # # @@ -25,7 +25,6 @@ import sentencepiece from transformers.tokenization_utils import BatchEncoding, PreTrainedTokenizer -from transformers.utils import logging SPIECE_UNDERLINE = "▁" @@ -329,7 +328,7 @@ def _switch_to_input_mode(self): def _switch_to_target_mode(self): self.prefix_tokens = None - self.suffix_tokens = [self.eos_token_id] + self.suffix_tokens = [self.eos_token_id] def set_lang_special_tokens(self, src_lang: str) -> None: """Reset the special tokens to the tgt lang setting. No prefix and suffix=[eos, tgt_lang_code].""" diff --git a/pythainlp/translate/zh_th.py b/pythainlp/translate/zh_th.py index df8cb1476..c1ce960ae 100644 --- a/pythainlp/translate/zh_th.py +++ b/pythainlp/translate/zh_th.py @@ -32,7 +32,7 @@ class ThZhTranslator: - GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth - Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822 - :param bool use_gpu : load model to gpu (Default is False) + :param bool use_gpu : load model using GPU (Default is False) """ def __init__( @@ -83,7 +83,7 @@ class ZhThTranslator: - GitHub: https://github.com/LalitaDeelert/lalita-mt-zhth - Facebook post https://web.facebook.com/aibuildersx/posts/166736255494822 - :param bool use_gpu : load model to gpu (Default is False) + :param bool use_gpu : load model using GPU (Default is False) """ def __init__( diff --git a/pythainlp/transliterate/core.py b/pythainlp/transliterate/core.py index 6d375e9cb..abd19b80a 100644 --- a/pythainlp/transliterate/core.py +++ b/pythainlp/transliterate/core.py @@ -160,7 +160,7 @@ def transliterate( if not text or not isinstance(text, str): return "" - if engine == "icu" or engine == "pyicu": + if engine in ("icu", "pyicu"): from pythainlp.transliterate.pyicu import transliterate elif engine == "ipa": from pythainlp.transliterate.ipa import transliterate diff --git a/pythainlp/transliterate/iso_11940.py b/pythainlp/transliterate/iso_11940.py index 2fbb135ce..911aa2914 100644 --- a/pythainlp/transliterate/iso_11940.py +++ b/pythainlp/transliterate/iso_11940.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
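The dispatch in pythainlp/transliterate/core.py above selects a per-engine
transliterate implementation by name. A minimal usage sketch of the public call
(not part of this patch), assuming the engine names listed in that module:

    # Minimal sketch: transliteration through the engine dispatcher.
    # Assumes the chosen engine's extra dependencies (e.g. pyicu) are installed.
    from pythainlp.transliterate import transliterate

    print(transliterate("สวัสดี"))                # default engine
    print(transliterate("สวัสดี", engine="icu"))  # same call, PyICU-backed engine
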
""" -Transliterating Thai text with ISO 11940 +Transliterating Thai text using ISO 11940 :See Also: * `Wikipedia \ diff --git a/pythainlp/transliterate/thai2rom.py b/pythainlp/transliterate/thai2rom.py index 7ef190661..566221547 100644 --- a/pythainlp/transliterate/thai2rom.py +++ b/pythainlp/transliterate/thai2rom.py @@ -18,7 +18,7 @@ import random import torch -import torch.nn as nn +from torch import nn import torch.nn.functional as F from pythainlp.corpus import get_corpus_path @@ -34,7 +34,7 @@ def __init__(self): Now supports Thai to Latin (romanization) """ - # get the model, will download if it's not available locally + # get the model, download it if it's not available locally self.__model_filename = get_corpus_path(_MODEL_NAME) loader = torch.load(self.__model_filename, map_location=device) @@ -115,7 +115,7 @@ def __init__( self, vocabulary_size, embedding_size, hidden_size, dropout=0.5 ): """Constructor""" - super(Encoder, self).__init__() + super().__init__() self.hidden_size = hidden_size self.character_embedding = nn.Embedding( vocabulary_size, embedding_size @@ -175,7 +175,7 @@ def init_hidden(self, batch_size): class Attn(nn.Module): def __init__(self, method, hidden_size): - super(Attn, self).__init__() + super().__init__() self.method = method self.hidden_size = hidden_size @@ -226,7 +226,7 @@ def __init__( self, vocabulary_size, embedding_size, hidden_size, dropout=0.5 ): """Constructor""" - super(AttentionDecoder, self).__init__() + super().__init__() self.vocabulary_size = vocabulary_size self.hidden_size = hidden_size self.character_embedding = nn.Embedding( @@ -343,7 +343,7 @@ def forward( decoder_input, decoder_hidden, encoder_outputs, mask ) - topv, topi = decoder_output.topk(1) + _, topi = decoder_output.topk(1) outputs[di] = decoder_output.to(device) teacher_force = random.random() < teacher_forcing_ratio diff --git a/pythainlp/transliterate/thai2rom_onnx.py b/pythainlp/transliterate/thai2rom_onnx.py index 94710b9cc..0e934abfc 100644 --- a/pythainlp/transliterate/thai2rom_onnx.py +++ b/pythainlp/transliterate/thai2rom_onnx.py @@ -15,11 +15,11 @@ """ Romanization of Thai words based on machine-learnt engine in ONNX runtime ("thai2rom") """ -from pythainlp.corpus import get_corpus_path -import numpy as np import json - +import numpy as np from onnxruntime import InferenceSession +from pythainlp.corpus import get_corpus_path + _MODEL_ENCODER_NAME = "thai2rom_encoder_onnx" _MODEL_DECODER_NAME = "thai2rom_decoder_onnx" @@ -33,7 +33,7 @@ def __init__(self): Now supports Thai to Latin (romanization) """ - # get the model, will download if it's not available locally + # get the model, download it if it's not available locally self.__encoder_filename = get_corpus_path(_MODEL_ENCODER_NAME) self.__decoder_filename = get_corpus_path(_MODEL_DECODER_NAME) self.__config_filename = get_corpus_path(_MODEL_CONFIG_NAME) diff --git a/pythainlp/transliterate/thaig2p.py b/pythainlp/transliterate/thaig2p.py index ff3515a36..5ea6ad3cd 100644 --- a/pythainlp/transliterate/thaig2p.py +++ b/pythainlp/transliterate/thaig2p.py @@ -21,7 +21,7 @@ import numpy as np import torch -import torch.nn as nn +from torch import nn import torch.nn.functional as F from pythainlp.corpus import get_corpus_path @@ -36,7 +36,7 @@ class ThaiG2P: """ def __init__(self): - # get the model, will download if it's not available locally + # get the model, download it if it's not available locally self.__model_filename = get_corpus_path(_MODEL_NAME) loader = torch.load(self.__model_filename, map_location=device) @@ 
-118,7 +118,7 @@ def __init__( self, vocabulary_size, embedding_size, hidden_size, dropout=0.5 ): """Constructor""" - super(Encoder, self).__init__() + super().__init__() self.hidden_size = hidden_size self.character_embedding = nn.Embedding( vocabulary_size, embedding_size @@ -182,7 +182,7 @@ def init_hidden(self, batch_size): class Attn(nn.Module): def __init__(self, method, hidden_size): - super(Attn, self).__init__() + super().__init__() self.method = method self.hidden_size = hidden_size @@ -233,7 +233,7 @@ def __init__( self, vocabulary_size, embedding_size, hidden_size, dropout=0.5 ): """Constructor""" - super(AttentionDecoder, self).__init__() + super().__init__() self.vocabulary_size = vocabulary_size self.hidden_size = hidden_size self.character_embedding = nn.Embedding( @@ -350,7 +350,7 @@ def forward( decoder_input, decoder_hidden, encoder_outputs, mask ) - topv, topi = decoder_output.topk(1) + _, topi = decoder_output.topk(1) outputs[di] = decoder_output.to(device) teacher_force = random.random() < teacher_forcing_ratio @@ -371,5 +371,4 @@ def forward( def transliterate(text: str) -> str: - global _THAI_G2P return _THAI_G2P.g2p(text) diff --git a/pythainlp/transliterate/tltk.py b/pythainlp/transliterate/tltk.py index f43889241..763014bee 100644 --- a/pythainlp/transliterate/tltk.py +++ b/pythainlp/transliterate/tltk.py @@ -20,7 +20,7 @@ def romanize(text: str) -> str: """ - Transliterating thai text to the Latin alphabet with tltk. + Transliterating thai text to the Latin alphabet using tltk. :param str text: Thai text to be romanized :return: A string of Thai words rendered in the Latin alphabet. diff --git a/pythainlp/transliterate/w2p.py b/pythainlp/transliterate/w2p.py index 1b0ea03bc..b69f1a754 100644 --- a/pythainlp/transliterate/w2p.py +++ b/pythainlp/transliterate/w2p.py @@ -51,15 +51,15 @@ class _Hparams: def _load_vocab(): g2idx = {g: idx for idx, g in enumerate(hp.graphemes)} - idx2g = {idx: g for idx, g in enumerate(hp.graphemes)} + idx2g = dict(enumerate(hp.graphemes)) p2idx = {p: idx for idx, p in enumerate(hp.phonemes)} - idx2p = {idx: p for idx, p in enumerate(hp.phonemes)} - # note that g and p mean grapheme and phoneme, respectively. + idx2p = dict(enumerate(hp.phonemes)) + # note that g and p mean grapheme and phoneme respectively. return g2idx, idx2g, p2idx, idx2p -class Thai_W2P(object): +class Thai_W2P(): def __init__(self): super().__init__() self.graphemes = hp.graphemes @@ -216,5 +216,4 @@ def pronunciate(text: str) -> str: :return: A string of Thai letters indicating how the input text should be pronounced. """ - global _THAI_W2P return _THAI_W2P(text) diff --git a/pythainlp/transliterate/wunsen.py b/pythainlp/transliterate/wunsen.py index c8c3eb038..373aaef72 100644 --- a/pythainlp/transliterate/wunsen.py +++ b/pythainlp/transliterate/wunsen.py @@ -53,13 +53,13 @@ def transliterate( """ Use Wunsen for transliteration - :param str text: text wants transliterated to Thai text. + :param str text: text to be transliterated to Thai text. 
:param str lang: source language
-        :param str jp_input: japanese input method (for japanese only)
-        :param bool zh_sandhi: mandarin third tone sandhi option
-            (for mandarin only)
-        :param str system: transliteration system (for japanese and
-            mandarin only)
+        :param str jp_input: Japanese input method (for Japanese only)
+        :param bool zh_sandhi: Mandarin third tone sandhi option
+            (for Mandarin only)
+        :param str system: transliteration system (for Japanese and
+            Mandarin only)

        :return: Thai text
        :rtype: str

@@ -134,7 +134,7 @@ def transliterate(
            self.jp_input = None
            self.zh_sandhi = zh_sandhi
            self.system = system
-        elif lang == "ko" or lang == "vi":
+        elif lang in ("ko", "vi"):
            self.jp_input = None
            self.zh_sandhi = None
            self.system = None
diff --git a/pythainlp/ulmfit/core.py b/pythainlp/ulmfit/core.py
index 85d09e044..505d0017e 100644
--- a/pythainlp/ulmfit/core.py
+++ b/pythainlp/ulmfit/core.py
@@ -46,10 +46,10 @@

 # Pretrained model paths
-THWIKI_LSTM = dict(
-    wgts_fname=get_corpus_path(_MODEL_NAME_LSTM),
-    itos_fname=get_corpus_path(_ITOS_NAME_LSTM),
-)
+THWIKI_LSTM = {
+    "wgts_fname": get_corpus_path(_MODEL_NAME_LSTM),
+    "itos_fname": get_corpus_path(_ITOS_NAME_LSTM),
+}

 # Preprocessing rules for Thai text
 # dense features
@@ -116,7 +116,7 @@ def process_thai(
       [บ้าน', 'xxrep', ' ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣',
       '😃', '😄', '😅', 'pythainlp', '&']

-    2. Modify pre_rules and post_rules arugments with
+    2. Modify pre_rules and post_rules arguments with
       rules provided in :mod:`pythainlp.ulmfit`:

        >>> from pythainlp.ulmfit import (
@@ -152,7 +152,7 @@ def process_thai(

 def document_vector(text: str, learn, data, agg: str = "mean"):
    """
-    This function vectorize Thai input text into a 400 dimension vector using
+    This function vectorizes Thai input text into a 400-dimension vector using
    :class:`fastai` language model and data bunch.

    :meth: `document_vector` get document vector using fastai language model
@@ -161,7 +161,7 @@ def document_vector(text: str, learn, data, agg: str = "mean"):
    :param learn: :class:`fastai` language model learner
    :param data: :class:`fastai` data bunch
    :param str agg: name of aggregation methods for word embeddings
-        The avialable methods are "mean" and "sum"
+        The available methods are "mean" and "sum"
    :return: :class:`numpy.array` of document vector sized 400 based on
        the encoder of the model
diff --git a/pythainlp/ulmfit/preprocess.py b/pythainlp/ulmfit/preprocess.py
index 60974dad0..2d2ce3589 100644
--- a/pythainlp/ulmfit/preprocess.py
+++ b/pythainlp/ulmfit/preprocess.py
@@ -30,11 +30,11 @@

 def replace_url(text: str) -> str:
    """
-    Replace url in `text` with TK_URL
+    Replace URL in `text` with TK_URL

-    :param str text: text to replace url
+    :param str text: text to replace URL in

-    :return: text where urls are replaced
+    :return: text with URLs replaced
    :rtype: str

    :Example:
@@ -49,11 +49,11 @@ def replace_url(text: str) -> str:

 def fix_html(text: str) -> str:
    """
-    List of replacements from html strings in `test`. (code from `fastai`)
+    Replace HTML strings in `text`. (code from `fastai`)

-    :param str text: text to replace html string
+    :param str text: text to replace HTML strings in

-    :return: text where html strings are replaced
+    :return: text with HTML strings replaced
    :rtype: str

    :Example:
@@ -83,25 +83,25 @@ def fix_html(text: str) -> str:

 def rm_useless_spaces(text: str) -> str:
-    """Remove multiple spaces in `text`. (code from `fastai`)"""
+    """Remove multiple spaces in `text`. 
(code from `fastai`)"""
    return re.sub(" {2,}", " ", text)


 def spec_add_spaces(text: str) -> str:
-    """Add spaces around / and # in `text`. \n (code from `fastai`)"""
+    """Add spaces around / and # in `text`. \n (code from `fastai`)"""
    return re.sub(r"([/#\n])", r" \1 ", text)


 def replace_rep_after(text: str) -> str:
    """
-    Replace repetitions at the character level in `text` after the repetition.
-    This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย'
-    ;instead it will retain the word as 'น้อย xxrep 8'
+    Replace repetitions at the character level in `text` after the repeated character.
+    This is to prevent cases such as 'น้อยยยยยยยย' becoming 'น้อ xxrep 8 ย'
+    ; instead, it will retain the word as 'น้อย xxrep 8'

-    :param str text: input text to replace character repetition
+    :param str text: input text to replace character repetitions in

    :return: text with repetitive token **xxrep** and the counter
-        after character repetition
+        after the repeated character
    :rtype: str

    :Example:
@@ -124,13 +124,13 @@ def _replace_rep(m):

 def replace_wrep_post(toks: Collection[str]) -> List[str]:
    """
-    Replace reptitive words post tokenization;
+    Replace repetitive words after tokenization;
    fastai `replace_wrep` does not work well with Thai.

    :param list[str] toks: list of tokens

    :return: list of tokens where the **xxwrep** token and the counter
-        is added in front of repetitive words.
+        are added before repetitive words.
    :rtype: list[str]

    :Example:
@@ -169,11 +169,11 @@ def rm_brackets(text: str) -> str:
    new_line = re.sub(r"\(\)", "", text)
    new_line = re.sub(r"\{\}", "", new_line)
    new_line = re.sub(r"\[\]", "", new_line)
-    # brakets with only punctuations
+    # brackets with only punctuation marks
    new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line)
    new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line)
    new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line)
-    # artifiacts after (
+    # artifacts after (
    new_line = re.sub(
        r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
    )
diff --git a/pythainlp/ulmfit/tokenizer.py b/pythainlp/ulmfit/tokenizer.py
index d738c9d86..184913ce6 100644
--- a/pythainlp/ulmfit/tokenizer.py
+++ b/pythainlp/ulmfit/tokenizer.py
@@ -21,7 +21,7 @@

 class BaseTokenizer:
-    """Basic class for a tokenizer function. (code from `fastai`)"""
+    """Basic class for a tokenizer function. (code from `fastai`)"""

    def __init__(self, lang: str):
        self.lang = lang
@@ -46,11 +46,11 @@ def __init__(self, lang: str = "th"):
    @staticmethod
    def tokenizer(text: str) -> List[str]:
        """
-        This function tokenizes text with *newmm* engine and the dictionary
+        This function tokenizes text using *newmm* engine and the dictionary
        specifically for `ulmfit` related functions
-        (see: `Dictonary file (.txt) \
+        (see: `Dictionary file (.txt) \
        `_).
-        :meth: tokenize text with a frozen newmm engine
+        :meth: tokenize text using a frozen newmm engine
        :param str text: text to tokenize
        :return: tokenized text
        :rtype: list[str]
@@ -59,7 +59,7 @@ def tokenizer(text: str) -> List[str]:

        Using :func:`pythainlp.ulmfit.ThaiTokenizer.tokenizer`
        is similar to :func:`pythainlp.tokenize.word_tokenize`
-        with *ulmfit* engine.
+        using *ulmfit* engine. 
>>> from pythainlp.ulmfit import ThaiTokenizer
        >>> from pythainlp.tokenize import word_tokenize
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index 8475444c1..59c424126 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -91,7 +91,6 @@
    thai_to_eng,
 )
 from pythainlp.util.emojiconv import emoji_to_thai
-from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
 from pythainlp.util.keywords import find_keyword, rank
 from pythainlp.util.normalize import (
    normalize,
@@ -125,5 +124,5 @@
 )
 from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
 from pythainlp.util.encoding import tis620_to_utf8
-import pythainlp.util.spell_words as spell_words
+from pythainlp.util import spell_words
 from pythainlp.util.abbreviation import abbreviation_to_full_text
diff --git a/pythainlp/util/abbreviation.py b/pythainlp/util/abbreviation.py
index 9e5b1dc18..d64ae7834 100644
--- a/pythainlp/util/abbreviation.py
+++ b/pythainlp/util/abbreviation.py
@@ -20,14 +20,14 @@

 def abbreviation_to_full_text(text: str, top_k: int=2) -> List[Tuple[str, Union[float, None]]]:
    """
-    This function convert Thai text (with abbreviation) to full text.
+    This function converts Thai text (with abbreviations) to full text.

-    This function use KhamYo for handles abbreviations.
+    This function uses KhamYo to handle abbreviations.
    See more `KhamYo `_.

    :param str text: Thai text
    :param int top_k: Top K
-    :return: Thai full text that handles abbreviations as full text and cos scores (original text - modified text).
+    :return: Thai text with abbreviations expanded to full form, and cos scores (original text - modified text).
    :rtype: List[Tuple[str, Union[float, None]]]

    :Example:
@@ -48,9 +48,9 @@ def abbreviation_to_full_text(text: str, top_k: int=2) -> List[Tuple[str, Union[
    except ImportError:
        raise ImportError(
            """
-            This funtion need to use khamyo.
+            This function needs to use khamyo.
            You can install by pip install khamyo or pip install pythainlp[abbreviation].
            """
        )
-    return _replace(text, top_k=top_k)
\ No newline at end of file
+    return _replace(text, top_k=top_k)
diff --git a/pythainlp/util/collate.py b/pythainlp/util/collate.py
index 936daef04..80d868e10 100644
--- a/pythainlp/util/collate.py
+++ b/pythainlp/util/collate.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Thai collation (sort according to Thai dictionary order)
+Thai collation (sorted according to Thai dictionary order)

 Simple implementation using regular expressions
 """
 import re
diff --git a/pythainlp/util/date.py b/pythainlp/util/date.py
index 7185d0d2c..018697730 100644
--- a/pythainlp/util/date.py
+++ b/pythainlp/util/date.py
@@ -15,7 +15,7 @@
 """
 Thai date/time conversion.

-Note: Does not take into account the change of new year's day in Thailand
+Note: It does not take into account the change of new year's day in Thailand
 """

 # BE คือ พ.ศ. 
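The BE/AD arithmetic behind these date utilities is a constant offset: BE = AD + 543.
A minimal usage sketch (not part of this patch), assuming convert_years is exported
from pythainlp.util as in this codebase:

    # Minimal sketch: Buddhist Era (BE) <-> Anno Domini (AD) year conversion.
    from pythainlp.util import convert_years

    print(convert_years("2566", src="be", target="ad"))  # "2023" (2566 - 543)
    print(convert_years("2023", src="ad", target="be"))  # "2566" (2023 + 543)
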
@@ -133,10 +133,10 @@ def convert_years(year: str, src="be", target="ad") -> str: """ Convert years - :param int year: year - :param str src: The src year + :param int year: Year + :param str src: The source year :param str target: The target year - :return: The years that be convert + :return: The converted year :rtype: str **Options for year** @@ -219,12 +219,12 @@ def thai_strptime( :param str fmt: string containing date and time directives :param str year: year of the text \ (ad isAnno Domini and be is Buddhist calendar) - :param int add_year: add year convert to ad + :param int add_year: add to year when converting to ad :param object tzinfo: tzinfo (default is Asia/Bangkok) - :return: The years that be convert to datetime.datetime + :return: The year that is converted to datetime.datetime :rtype: datetime.datetime - The fmt char that support: + The fmt chars that are supported: * *%d* - Day (1 - 31) * *%B* - Thai month (03, 3, มี.ค., or มีนาคม) * *%Y* - Year (66, 2566, or 2023) @@ -343,7 +343,7 @@ def now_reign_year() -> int: def reign_year_to_ad(reign_year: int, reign: int) -> int: """ - Convert reigh year to AD. + Convert reign year to AD. Return AD year according to the reign year for the 7th to 10th King of Chakri dynasty, Thailand. @@ -385,7 +385,7 @@ def thaiword_to_date( """ Convert Thai relative date to :class:`datetime.datetime`. - :param str text: Thai text contains relative date + :param str text: Thai text containing relative date :param datetime.datetime date: date (default is datetime.datetime.now()) :return: datetime object, if it can be calculated. Otherwise, None. diff --git a/pythainlp/util/digitconv.py b/pythainlp/util/digitconv.py index fe2df8a55..65327f0b6 100644 --- a/pythainlp/util/digitconv.py +++ b/pythainlp/util/digitconv.py @@ -75,11 +75,11 @@ def thai_digit_to_arabic_digit(text: str) -> str: """ - This function convert Thai digits (i.e. ๑, ๓, ๑๐) to Arabic digits + This function converts Thai digits (i.e. ๑, ๓, ๑๐) to Arabic digits (i.e. 1, 3, 10). :param str text: Text with Thai digits such as '๑', '๒', '๓' - :return: Text with Thai digits being converted to Arabic digits + :return: Text with Thai digits converted to Arabic digits such as '1', '2', '3' :rtype: str @@ -101,11 +101,11 @@ def thai_digit_to_arabic_digit(text: str) -> str: def arabic_digit_to_thai_digit(text: str) -> str: """ - This function convert Arabic digits (i.e. 1, 3, 10) to Thai digits + This function converts Arabic digits (i.e. 1, 3, 10) to Thai digits (i.e. ๑, ๓, ๑๐). :param str text: Text with Arabic digits such as '1', '2', '3' - :return: Text with Arabic digits being converted to Thai digits + :return: Text with Arabic digits converted to Thai digits such as '๑', '๒', '๓' :rtype: str @@ -129,12 +129,12 @@ def arabic_digit_to_thai_digit(text: str) -> str: def digit_to_text(text: str) -> str: """ :param str text: Text with digits such as '1', '2', '๓', '๔' - :return: Text with digits being spelled out in Thai + :return: Text with digits spelled out in Thai """ if not text or not isinstance(text, str): return "" - # Convert Thai numerals to Arabic + # Convert Thai numerals to Arabic ones text = text.translate(_thai_arabic_translate_table) # Spell out Arabic numerals in Thai text text = text.translate(_digit_spell_translate_table) @@ -143,11 +143,11 @@ def digit_to_text(text: str) -> str: def text_to_arabic_digit(text: str) -> str: """ - This function convert Thai spelled out digits to Arabic digits. + This function converts spelled out digits in Thai to Arabic digits. 
:param text: A digit spelled out in Thai
    :return: An Arabic digit such as '1', '2', '3' if the text is
-        Thai digit spelled out (ศูนย์, หนึ่ง, สอง, ..., เก้า).
+        digit spelled out in Thai (ศูนย์, หนึ่ง, สอง, ..., เก้า).
        Otherwise, it returns an empty string.
    :rtype: str

@@ -165,7 +165,7 @@ def text_to_arabic_digit(text: str) -> str:
        text_to_arabic_digit("เก้า")
        # output: 9

-        # For text that is not Thai digit spelled out
+        # For text that is not digit spelled out in Thai
        text_to_arabic_digit("สิบ") == ""
        # output: True
        text_to_arabic_digit("เก้าร้อย") == ""
@@ -179,11 +179,11 @@ def text_to_arabic_digit(text: str) -> str:

 def text_to_thai_digit(text: str) -> str:
    """
-    This function convert Thai spelled out digits to Thai digits.
+    This function converts spelled out digits in Thai to Thai digits.

    :param text: A digit spelled out in Thai
-    :return: A Thai digit such as '๑', '๒', '๓' if the text is Thai digit
-        spelled out (ศูนย์, หนึ่ง, สอง, ..., เก้า).
+    :return: A Thai digit such as '๑', '๒', '๓' if the text is digit
+        spelled out in Thai (ศูนย์, หนึ่ง, สอง, ..., เก้า).
        Otherwise, it returns an empty string.
    :rtype: str

diff --git a/pythainlp/util/emojiconv.py b/pythainlp/util/emojiconv.py
index 198ae5f3e..12af837f1 100644
--- a/pythainlp/util/emojiconv.py
+++ b/pythainlp/util/emojiconv.py
@@ -1844,10 +1844,10 @@

 def emoji_to_thai(text: str, delimiters=(_delimiter, _delimiter)) -> str:
    """
-    This function convert emoji to thai meaning
+    This function converts emojis to their Thai meanings

-    :param str text: Text with Emoji
-    :return: Text with Emoji being converted to thai meaning
+    :param str text: Text with emojis
+    :return: Text with emojis converted to their Thai meanings
    :rtype: str

    :Example:
diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py
index aa401c2ad..91f18f411 100644
--- a/pythainlp/util/encoding.py
+++ b/pythainlp/util/encoding.py
@@ -16,8 +16,8 @@ def tis620_to_utf8(text: str)->str:
    """
    Convert TIS-620 to UTF-8

-    :param str text: Text that use TIS-620 encoding
-    :return: Text that use UTF-8 encoding
+    :param str text: Text that uses TIS-620 encoding
+    :return: Text that uses UTF-8 encoding
    :rtype: str

    :Example:
diff --git a/pythainlp/util/keyboard.py b/pythainlp/util/keyboard.py
index 50a61709c..8a9cbe66e 100644
--- a/pythainlp/util/keyboard.py
+++ b/pythainlp/util/keyboard.py
@@ -136,9 +136,9 @@ def eng_to_thai(text: str) -> str:
    Qwerty keyboard layout to the originally intended keyboard layout
    that is the Thai Kedmanee keyboard.

-    :param str text: incorrect text input (type Thai with English keyboard)
-    :return: Thai text where incorrect typing with
-             a keyboard layout is corrected
+    :param str text: incorrect text input (Thai typed using English keyboard)
+    :return: Thai text in which typing with the wrong
+             keyboard layout is corrected
    :rtype: str

    :Example:
@@ -159,9 +159,9 @@ def thai_to_eng(text: str) -> str:
    keyboard layout to the originally intended keyboard layout
    that is the English-US Qwerty keyboard. 
-    :param str text: incorrect text input (type English with Thai keyboard)
-    :return: English text where incorrect typing with
-             a keyboard layout is corrected
+    :param str text: incorrect text input (English typed using Thai keyboard)
+    :return: English text in which typing with the wrong
+             keyboard layout is corrected
    :rtype: str

    :Example:
@@ -178,7 +178,7 @@ def thai_to_eng(text: str) -> str:

 def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    """
-    Calculate euclidean distance between two Thai characters
+    Calculate Euclidean distance between two Thai characters
    according to their location on a Thai keyboard layout.

    A modified TIS 820-2531 standard keyboard layout, which is developed
@@ -198,7 +198,7 @@ def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    :param str c1: first character
    :param str c2: second character
    :param str shift_dist: return value if they're shifted
-    :return: euclidean distance between two characters
+    :return: Euclidean distance between two characters
    :rtype: float

    :Example:
diff --git a/pythainlp/util/keywords.py b/pythainlp/util/keywords.py
index 6fda66048..178f8f4b1 100644
--- a/pythainlp/util/keywords.py
+++ b/pythainlp/util/keywords.py
@@ -22,21 +22,21 @@

 def rank(words: List[str], exclude_stopwords: bool = False) -> Counter:
    """
-    Count word frequecy given a list of Thai words with an option
+    Count word frequencies given a list of Thai words with an option
    to exclude stopwords.

    :param list words: a list of words
-    :param bool exclude_stopwords: If this parameter is set to **True**
-        to exclude stopwords from counting.
+    :param bool exclude_stopwords: If this parameter is set to **True**,
+        exclude stopwords from counting.
        Otherwise, the stopwords will be counted.
        By default, `exclude_stopwords`is set to **False**

-    :return: a Counter object representing word frequency from the text
+    :return: a Counter object representing word frequencies in the text
    :rtype: :class:`collections.Counter`

    :Example:

-    Include stopwords in counting word frequency::
+    Include stopwords when counting word frequencies::

        from pythainlp.util import rank

        # 'เหตุการณ์': 1
        # })

-    Exclude stopword in counting word frequency::
+    Exclude stopwords when counting word frequencies::

        from pythainlp.util import rank

@@ -84,13 +84,13 @@ def rank(words: List[str], exclude_stopwords: bool = False) -> Counter:

 def find_keyword(word_list: List[str], min_len: int = 3) -> Dict[str, int]:
    """
-    This function count the frequency of words in the list
-    where stopword is excluded and returns as a frequency dictionary.
+    This function counts the frequencies of words in the list
+    where stopwords are excluded and returns a frequency dictionary. 
:param list word_list: a list of words
-    :param int min_len: the mininum frequency for words to obtain
+    :param int min_len: the minimum frequency for words to be retained

-    :return: a dictionary object with key-value pair as word and its raw count
+    :return: a dictionary object with key-value pairs of words and their raw counts
    :rtype: dict[str, int]

    :Example:
diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py
index ad94c698b..a8cacae22 100644
--- a/pythainlp/util/normalize.py
+++ b/pythainlp/util/normalize.py
@@ -127,12 +127,12 @@ def remove_tonemark(text: str) -> str:

    * Down tone (Thai: ไม้เอก  _่ )
    * Falling tone  (Thai: ไม้โท  _้ )
-    * High tone (Thai: ไม้ตรี  ​_๊ )
+    * High tone (Thai: ไม้ตรี  _๊ )
    * Rising tone (Thai: ไม้จัตวา _๋ )

    Putting wrong tone mark is a common mistake in Thai writing.
    By removing tone marks from the string, it could be used to
-    for a approximate string matching
+    for approximate string matching.

    :param str text: input text
    :return: text without Thai tone marks
@@ -238,7 +238,7 @@ def normalize(text: str) -> str:
    Note: for Unicode normalization, see unicodedata.normalize().

    :param str text: input text
-    :return: normalized text according to the fules
+    :return: normalized text according to the rules
    :rtype: str

    :Example:
@@ -268,7 +268,7 @@ def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    This function is preprocessing MaiYaMok in Thai sentence.

    :param Union[str, List[str]] sent: input sentence (list or str)
-    :return: List of words
+    :return: list of words
    :rtype: List[str]

    :Example:
diff --git a/pythainlp/util/numtoword.py b/pythainlp/util/numtoword.py
index 16fa06b03..798543e4c 100644
--- a/pythainlp/util/numtoword.py
+++ b/pythainlp/util/numtoword.py
@@ -90,7 +90,7 @@ def bahttext(number: float) -> str:

 def num_to_thaiword(number: int) -> str:
    """
-    This function convert number to Thai text
+    This function converts number to Thai text

    :param int number: an integer number to be converted to Thai text
    :return: text representing the number in Thai
diff --git a/pythainlp/util/phoneme.py b/pythainlp/util/phoneme.py
index abe3ca0cd..08d642031 100644
--- a/pythainlp/util/phoneme.py
+++ b/pythainlp/util/phoneme.py
@@ -90,10 +90,10 @@

 def nectec_to_ipa(pronunciation: str) -> str:
    """
-    Converter NECTEC system to IPA system
+    Convert NECTEC system to IPA system

    :param str pronunciation: NECTEC phoneme
-    :return: IPA that be convert
+    :return: the converted IPA
    :rtype: str

    :Example:
@@ -193,12 +193,12 @@ def nectec_to_ipa(pronunciation: str) -> str:

 def ipa_to_rtgs(ipa: str) -> str:
    """
-    Converter IPA system to The Royal Thai General System of Transcription (RTGS)
+    Convert IPA system to The Royal Thai General System of Transcription (RTGS)

    Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai

    :param str ipa: IPA phoneme
-    :return: The RTGS that be convert
+    :return: the converted RTGS
    :rtype: str

    :Example:
@@ -213,9 +213,9 @@ def ipa_to_rtgs(ipa: str) -> str:
    _temp = []
    _list_ipa = ipa_cut.word_tokenize(ipa)
    for i,p in enumerate(_list_ipa):
-        if i == len(_list_ipa) -1 and p in list(dict_ipa_rtgs_final.keys()):
+        if i == len(_list_ipa) -1 and p in list(dict_ipa_rtgs_final):
            _temp.append(dict_ipa_rtgs_final[p])
-        elif p in list(dict_ipa_rtgs.keys()):
+        elif p in list(dict_ipa_rtgs):
            _temp.append(dict_ipa_rtgs[p])
        else:
            _temp.append(p)
@@ -226,10 +226,10 @@ def ipa_to_rtgs(ipa: str) -> str:

 def remove_tone_ipa(ipa: str) -> str:
    """
-    Remove Thai Tone from IPA system
+    Remove Thai tones from IPA system

    :param str ipa: IPA phoneme
- 
:return: IPA phoneme that deleted tone + :return: IPA phoneme with tones removed :rtype: str :Example: diff --git a/pythainlp/util/spell_words.py b/pythainlp/util/spell_words.py index 9344df9f4..7c2cc830e 100644 --- a/pythainlp/util/spell_words.py +++ b/pythainlp/util/spell_words.py @@ -27,19 +27,19 @@ from pythainlp.tokenize import subword_tokenize -_r1=["เ-ย","เ-ะ","แ-ะ","โ-ะ","เ-าะ","เ-อะ","เ-อ","เ-า"] -_r2=["–ั:วะ","เ–ี:ยะ","เ–ือะ","–ั:ว","เ–ี:ย","เ–ื:อ","–ื:อ"] -tonemarks={i:"ไม้"+j for i,j in zip(list(thai_tonemarks),["เอก","โท","ตรี","จัตวา"])} +_r1=["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"] +_r2=["–ั:วะ", "เ–ี:ยะ", "เ–ือะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"] +tonemarks={i: "ไม้"+j for i, j in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])} -rule1=[i.replace("-",f"([{thai_letters}](thai_tonemarks)?)") for i in _r1] -rule2=[i.replace("–",f"([{thai_letters}])").replace(":",f"") for i in _r2] -rule3=[i.replace("–",f"([{thai_letters}])").replace(":",f"([{thai_tonemarks}])") for i in _r2] +rule1=[i.replace("-", f"([{thai_letters}](thai_tonemarks)?)") for i in _r1] +rule2=[i.replace("–", f"([{thai_letters}])").replace(":", "") for i in _r2] +rule3=[i.replace("–", f"([{thai_letters}])").replace(":", f"([{thai_tonemarks}])") for i in _r2] dict_vowel_ex={} for i in _r1+_r2: - dict_vowel_ex[i.replace("-","อ").replace("–","อ").replace(":","")]=i.replace("-","อ").replace(":","").replace("–","อ") + dict_vowel_ex[i.replace("-", "อ").replace("–", "อ").replace(":", "")]=i.replace("-", "อ").replace(":", "").replace("–", "อ") dict_vowel={} for i in _r1+_r2: - dict_vowel[i.replace("-","อ").replace("–","อ").replace(":","")]=i.replace("-","อ").replace(":","").replace("–","อ") + dict_vowel[i.replace("-", "อ").replace("–", "อ").replace(":", "")]=i.replace("-", "อ").replace(":", "").replace("–", "อ") for i in thai_lead_vowels: dict_vowel[i]=i+"อ" for i in thai_follow_vowels: @@ -49,33 +49,33 @@ for i in thai_below_vowels: dict_vowel[i]="อ"+i -_cut=Tokenizer(list(dict_vowel.keys())+list(thai_consonants),engine="mm") +_cut=Tokenizer(list(dict_vowel.keys())+list(thai_consonants), engine="mm") def _clean(w): - if bool(re.match('|'.join(rule3),w)): + if bool(re.match('|'.join(rule3), w)): for r in rule3: - if bool(re.match(r,w)): - _w=re.sub(r,"\\1==\\2==",w) + if bool(re.match(r, w)): + _w=re.sub(r, "\\1==\\2==", w) _temp=_w.split("==") - w=_temp[0]+r.replace(f"([{thai_letters}])","อ").replace(f"([{thai_tonemarks}])","")+_temp[1] - elif bool(re.match('|'.join(rule2),w)): + w=_temp[0]+r.replace(f"([{thai_letters}])", "อ").replace(f"([{thai_tonemarks}])", "")+_temp[1] + elif bool(re.match('|'.join(rule2), w)): for r in rule2: - if bool(re.match(r,w)): - w=re.sub(r,"\\1",w)+r.replace(f"([{thai_letters}])","อ") - elif bool(re.match('|'.join(rule1),w)): + if bool(re.match(r, w)): + w=re.sub(r, "\\1", w)+r.replace(f"([{thai_letters}])", "อ") + elif bool(re.match('|'.join(rule1), w)): for r in rule1: - if bool(re.match(r,w)): - w=re.sub(r,"\\1",w)+r.replace(f"([{thai_letters}](thai_tonemarks)?)","อ") + if bool(re.match(r, w)): + w=re.sub(r, "\\1", w)+r.replace(f"([{thai_letters}](thai_tonemarks)?)", "อ") return w def spell_syllable(s: str)-> List[str]: """ - Spell syllable by Thai word distribution form. + Spell out syllables in Thai word distribution form. 

-    :param str s: Thai syllable only
-    :return: List of spell syllable
+    :param str s: a Thai syllable
+    :return: List of spelled out syllables
    :rtype: List[str]

    :Example:
@@ -89,7 +89,7 @@ def spell_syllable(s: str)-> List[str]:
    _t=s
    s=_cut.word_tokenize(_clean(s))
    _c_only = [i+"อ" for i in s if i in set(thai_consonants)]
-    _v_only = [dict_vowel[i] for i in s if i in set(dict_vowel.keys())]
+    _v_only = [dict_vowel[i] for i in s if i in set(dict_vowel)]
    _t_only = [tonemarks[i] for i in s if i in set(tonemarks.keys())]
    _out=_c_only+_v_only+_t_only
    _out.append(_t)
@@ -98,10 +98,10 @@ def spell_syllable(s: str)-> List[str]:

 def spell_word(w: str)-> List[str]:
    """
-    Spell word by Thai word distribution form.
+    Spell out a word in Thai word distribution form.

-    :param str w: Thai word only
-    :return: List of spell word
+    :param str w: a Thai word
+    :return: List of spelled out words
    :rtype: List[str]

    :Example:
@@ -113,9 +113,9 @@ def spell_word(w: str)-> List[str]:
        # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
    """
    _r=[]
-    _temp=subword_tokenize(w,engine="ssg")
+    _temp=subword_tokenize(w, engine="ssg")
    for i in _temp:
        _r.extend(spell_syllable(i))
    if len(_temp)>1:
        _r.append(w)
-    return _r
\ No newline at end of file
+    return _r
diff --git a/pythainlp/util/strftime.py b/pythainlp/util/strftime.py
index 197cf68b0..56932a381 100644
--- a/pythainlp/util/strftime.py
+++ b/pythainlp/util/strftime.py
@@ -68,7 +68,7 @@ def _thai_strftime(dt_obj: datetime, fmt_char: str) -> str:
    """
    Conversion support for thai_strftime().

-    The fmt_char should be in _NEED_L10N when call this function.
+    The fmt_char should be in _NEED_L10N when calling this function.
    """
    str_ = ""
    if fmt_char == "A":
@@ -87,7 +87,7 @@ def _thai_strftime(dt_obj: datetime, fmt_char: str) -> str:
        # Thai Buddhist century (AD+543)/100 + 1 as decimal number;
        str_ = str(int((dt_obj.year + _BE_AD_DIFFERENCE) / 100) + 1).zfill(2)
    elif fmt_char == "c":
-        # Locale’s appropriate date and time representation
+        # Locale's appropriate date and time representation
        # Wed  6 Oct 01:40:00 1976
        # พ   6 ต.ค. 01:40:00 2519  <-- left-aligned weekday, right-aligned day
        str_ = "{:<2} {:>2} {} {} {}".format(
@@ -193,13 +193,13 @@ def thai_strftime(
      * The Thai Buddhist Era (BE) year is simply converted from AD
        by adding 543. This is certainly not accurate for years
        before 1941 AD, due to the change in Thai New Year's Day.
-      * This meant to be an interrim solution, since
+      * This is meant to be an interim solution, since
        Python standard's locale module (which relied on C's strftime())
        does not support "th" or "th_TH" locale yet. If supported,
        we can just locale.setlocale(locale.LC_TIME, "th_TH")
        and then use native datetime.strftime().

-    We trying to make this platform-independent and support extentions
+    We are trying to make this platform-independent and support extensions
    as many as possible. See these links for strftime() extensions
    in POSIX, BSD, and GNU libc:

diff --git a/pythainlp/util/syllable.py b/pythainlp/util/syllable.py
index ca894fcc8..d02640f5b 100644
--- a/pythainlp/util/syllable.py
+++ b/pythainlp/util/syllable.py
@@ -33,7 +33,7 @@
 thai_consonants_all.remove("อ")

 _temp = list(
-    "".join(["".join(spelling_class[i]) for i in spelling_class.keys()])
+    "".join(["".join(v) for v in spelling_class.values()])
 )
 not_spelling_class = [j for j in thai_consonants_all if j not in _temp]

@@ -43,10 +43,10 @@
 pattern = re.compile("เ(.*)า", re.U)  # เ-า is live syllable

 _check_1 = []
-# these spelling consonant are live syllable. 
+# These spelling consonants are live syllables.
 for i in ["กง", "กน", "กม", "เกย", "เกอว"]:
     _check_1.extend(spelling_class[i])
-# these spelling consonant are dead syllable.
+# These spelling consonants are dead syllables.
 _check_2 = spelling_class["กก"] + spelling_class["กบ"] + spelling_class["กด"]
 
 thai_low_sonorants = list("งนมยรลว")
@@ -73,7 +73,7 @@ def sound_syllable(syllable: str) -> str:
     Sound syllable classification
 
     This function is sound syllable classification.
-    It is live syllable or dead syllable.
+    It classifies a syllable as either live or dead.
 
     :param str syllable: Thai syllable
     :return: syllable's type (live or dead)
@@ -98,20 +98,20 @@ def sound_syllable(syllable: str) -> str:
         if len(syllable) < 2:
             return "dead"
         elif (spelling_consonant in _check_2) and (
-            any((c in set("าีืแูาเโ")) for c in syllable) == False
-            and any((c in set("ำใไ")) for c in syllable) == False
-            and bool(pattern.search(syllable)) != True
+            any((c in set("าีืแูาเโ")) for c in syllable) is False
+            and any((c in set("ำใไ")) for c in syllable) is False
+            and bool(pattern.search(syllable)) is not True
         ):
             return "dead"
         elif any((c in set("าีืแูาโ")) for c in syllable):  # in syllable:
             if (
                 spelling_consonant in _check_1
-                and bool(re_short.search(syllable)) != True
+                and bool(re_short.search(syllable)) is not True
             ):
                 return "live"
             elif (
                 spelling_consonant != syllable[-1]
-                and bool(re_short.search(syllable)) != True
+                and bool(re_short.search(syllable)) is not True
             ):
                 return "live"
             elif spelling_consonant in _check_2:
@@ -122,7 +122,7 @@ def sound_syllable(syllable: str) -> str:
             return "dead"
         return "live"
     elif any((c in set("ำใไ")) for c in syllable):
-        return "live"  # if these vowel's long sound are live syllable
+        return "live"  # syllables with these long vowels are live
     elif bool(pattern.search(syllable)):  # if it is เ-า
         return "live"
     elif spelling_consonant in _check_1:
@@ -134,7 +134,7 @@ def sound_syllable(syllable: str) -> str:
         return "live"
     elif bool(
         re_short.search(syllable)
-    ) or any(  # if found vowel's short sound
+    ) or any(  # if vowel's short sound is found
         (c in set(short)) for c in syllable
     ):  # consonant in short
         return "dead"
@@ -144,9 +144,9 @@ def syllable_open_close_detector(syllable: str) -> str:
     """
-    Thai syllable open/close detector
+    Open/close Thai syllable detector
 
-    This function is use for find Thai syllable that open or closed sound.
+    This function is used for finding Thai syllables that have an open or closed sound.
 
     :param str syllable: Thai syllable
     :return: open / close
@@ -175,7 +175,7 @@ def syllable_length(syllable: str) -> str:
     """
     Thai syllable length
 
-    This function is use for find syllable's length. (long or short)
+    This function is used for finding syllable's length. 
(long or short) :param str syllable: Thai syllable :return: syllable's length (long or short) @@ -224,7 +224,7 @@ def tone_detector(syllable: str) -> str: Thai tone detector for syllables :param str syllable: Thai syllable - :return: syllable's tone (l, m, h, r, f or empty if it cannot detector) + :return: syllable's tone (l, m, h, r, f or empty if it cannot be detected) :rtype: str :Example: @@ -244,12 +244,12 @@ def tone_detector(syllable: str) -> str: initial_consonant = consonants[0] tone_mark = _tone_mark_detector(syllable) syllable_check = syllable_open_close_detector(syllable) - syllable_check_lenght = syllable_length(syllable) + syllable_check_length = syllable_length(syllable) initial_consonant_type = thai_initial_consonant_to_type[initial_consonant] # r for store value r = "" if len(consonants) > 1 and ( - initial_consonant == "อ" or initial_consonant == "ห" + initial_consonant in ("อ", "ห") ): consonant_ending = _check_sonorant_syllable(syllable) if ( @@ -281,21 +281,21 @@ def tone_detector(syllable: str) -> str: r = "r" elif ( initial_consonant_type == "low" - and syllable_check_lenght == "short" + and syllable_check_length == "short" and syllable_check == "close" and s == "dead" ): r = "h" elif ( initial_consonant_type == "low" - and syllable_check_lenght == "long" + and syllable_check_length == "long" and syllable_check == "close" and s == "dead" ): r = "f" elif ( initial_consonant_type == "low" - and syllable_check_lenght == "short" + and syllable_check_length == "short" and syllable_check == "open" ): r = "h" diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py index edcb31753..ce0cd992c 100644 --- a/pythainlp/util/thai.py +++ b/pythainlp/util/thai.py @@ -43,7 +43,7 @@ def isthaichar(ch: str) -> bool: :param ch: input character :type ch: str - :return: True if ch is a Thai characttr, otherwise False. + :return: True if ch is a Thai character, otherwise False. :rtype: bool :Example: @@ -58,19 +58,19 @@ def isthaichar(ch: str) -> bool: # output: True """ ch_val = ord(ch) - if ch_val >= _TH_FIRST_CHAR_ASCII and ch_val <= _TH_LAST_CHAR_ASCII: + if _TH_FIRST_CHAR_ASCII <= ch_val <= _TH_LAST_CHAR_ASCII: return True return False def isthai(text: str, ignore_chars: str = ".") -> bool: - """Check if every characters in a string are Thai character. + """Check if every character in a string is a Thai character. :param text: input text :type text: str :param ignore_chars: characters to be ignored, defaults to "." :type ignore_chars: str, optional - :return: True if every characters in the input string are Thai, + :return: True if every character in the input string is Thai, otherwise False. :rtype: bool @@ -106,10 +106,10 @@ def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float: :param text: input text :type text: str - :param ignore_chars: characters to be ignored, defaults to whitespaces,\\ - digits, and puntuations. + :param ignore_chars: characters to be ignored, defaults to whitespace,\\ + digits, and punctuation marks. :type ignore_chars: str, optional - :return: proportion of Thai characters in the text (percent) + :return: proportion of Thai characters in the text (percentage) :rtype: float :Example: @@ -185,12 +185,12 @@ def thai_word_tone_detector(word: str) -> Tuple[str, str]: """ Thai tone detector for word. - It use pythainlp.transliterate.pronunciate for convert word to\ + It uses pythainlp.transliterate.pronunciate for converting word to\ pronunciation. :param str word: Thai word. 
- :return: Thai pronunciation with tone each syllables.\ - (l, m, h, r, f or empty if it cannot detector) + :return: Thai pronunciation with tones in each syllable.\ + (l, m, h, r, f or empty if it cannot be detected) :rtype: Tuple[str, str] :Example: diff --git a/pythainlp/util/thaiwordcheck.py b/pythainlp/util/thaiwordcheck.py index 3230613af..95ff9bc07 100644 --- a/pythainlp/util/thaiwordcheck.py +++ b/pythainlp/util/thaiwordcheck.py @@ -65,7 +65,7 @@ "ผลิ", } -# Diphthong prefixes (can starts native Thai word) +# Diphthong prefixes (can start native Thai word) _TH_PREFIX_DIPHTHONG = {"กะ", "กระ", "ปะ", "ประ"} # Thai consonant filter @@ -76,7 +76,7 @@ def is_native_thai(word: str) -> bool: """ Check if a word is an "native Thai word" (Thai: "คำไทยแท้") - This function based on a simple heuristic algorithm + This function is based on a simple heuristic algorithm and cannot be entirely reliable. :param str word: word @@ -115,11 +115,11 @@ def is_native_thai(word: str) -> bool: if word in _TH_NATIVE_WORDS: return True - # If a word contains non-Thai char, it is not a native Thai + # If a word contains non-Thai chars, it is not a native Thai if any(ch in word for ch in _TH_NON_NATIVE_CHARS): return False - # If does not contain any Thai consonants -> cannot be Thai + # If it does not contain any Thai consonants -> it cannot be Thai chs = re.findall(_TH_CONSONANTS_PATTERN, word) if not chs: return False @@ -133,7 +133,7 @@ def is_native_thai(word: str) -> bool: return True # Note: This will not work, as it check the whole word, not the prefix. - # Prefix-sentitive tokenization is required in order to able to check this. + # Prefix-sensitive tokenization is required in order to be able to check this. if word in _TH_PREFIX_DIPHTHONG: return True diff --git a/pythainlp/util/time.py b/pythainlp/util/time.py index 0203c7df4..dd2a6a030 100644 --- a/pythainlp/util/time.py +++ b/pythainlp/util/time.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Spell out time to Thai words. +Spell out time as Thai words. Convert time string or time object to Thai words. """ @@ -138,11 +138,11 @@ def _format( else: raise NotImplementedError(f"Time format not supported: {fmt}") - if precision == "m" or precision == "s": + if precision in ("m", "s"): if ( m == 30 and (s == 0 or precision == "m") - and (fmt == "6h" or fmt == "m6h") + and (fmt in ("6h", "m6h")) ): text += "ครึ่ง" else: @@ -151,7 +151,7 @@ def _format( text += num_to_thaiword(s) + "วินาที" else: if m: - if m == 30 and s == 0 and (fmt == "6h" or fmt == "m6h"): + if m == 30 and s == 0 and (fmt in ("6h", "m6h")): text += "ครึ่ง" else: text += num_to_thaiword(m) + "นาที" @@ -167,7 +167,7 @@ def time_to_thaiword( precision: Union[str, None] = None, ) -> str: """ - Spell out time to Thai words. + Spell out time as Thai words. 
:param str time_data: time input, can be a datetime.time object \ or a datetime.datetime object \ @@ -176,11 +176,11 @@ def time_to_thaiword( * *24h* - 24-hour clock (default) * *6h* - 6-hour clock * *m6h* - Modified 6-hour clock - :param str precision: precision of the spell out - * *m* - always spell out to minute level - * *s* - always spell out to second level + :param str precision: precision of the spell out time + * *m* - always spell out at minute level + * *s* - always spell out at second level * None - spell out only non-zero parts - :return: Time spell out in Thai words + :return: Time spelled out as Thai words :rtype: str :Example: @@ -212,12 +212,12 @@ def time_to_thaiword( """ _time = None - if isinstance(time_data, time) or isinstance(time_data, datetime): + if isinstance(time_data, (time, datetime)): _time = time_data else: if not isinstance(time_data, str): raise TypeError( - "Time data must be a datetime.time object, " + "Time input must be a datetime.time object, " "a datetime.datetime object, or a string." ) @@ -247,7 +247,7 @@ def thaiword_to_time(text: str, padding: bool = True) -> str: Convert Thai time in words into time (H:M). :param str text: Thai time in words - :param bool padding: Zero padding the hour if True + :param bool padding: Zero pad the hour if True :return: time string :rtype: str diff --git a/pythainlp/util/trie.py b/pythainlp/util/trie.py index ff9bcfc17..cdc2025f3 100644 --- a/pythainlp/util/trie.py +++ b/pythainlp/util/trie.py @@ -15,13 +15,13 @@ """ Trie data structure. -Designed to use for tokenizer's dictionary, but can be for other purposes. +Designed to be used for tokenizer's dictionary, but can be for other purposes. """ from typing import Iterable, List, Union class Trie: - class Node(object): + class Node(): __slots__ = "end", "children" def __init__(self): diff --git a/pythainlp/util/wordtonum.py b/pythainlp/util/wordtonum.py index ef5fa7139..03524843a 100644 --- a/pythainlp/util/wordtonum.py +++ b/pythainlp/util/wordtonum.py @@ -177,10 +177,10 @@ def words_to_num(words: list) -> float: def text_to_num(text: str) -> List[str]: """ - Thai text to list thai word with floating point number + Thai text to list of Thai words with floating point numbers :param str text: Thai text with the spelled-out numerals - :return: list of thai words with float value of the input + :return: list of Thai words with float values of the input :rtype: List[str] :Example: diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 0abfe2066..f3d31ab20 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -14,11 +14,11 @@ # limitations under the License. from typing import List, Tuple, Union import re +import warnings from transformers import ( CamembertTokenizer, pipeline, ) -import warnings from pythainlp.tokenize import word_tokenize _model_name = "wangchanberta-base-att-spm-uncased" @@ -34,7 +34,7 @@ def __init__( self, dataset_name: str = "thainer", grouped_entities: bool = True ): """ - This function tags named-entitiy from text in IOB format. + This function tags named entities in text in IOB format. Powered by wangchanberta from VISTEC-depa\ AI Research Institute of Thailand @@ -66,21 +66,21 @@ def get_ner( self, text: str, pos: bool= False,tag: bool = False ) -> Union[List[Tuple[str, str]], str]: """ - This function tags named-entitiy from text in IOB format. + This function tags named entities in text in IOB format. 
        Powered by wangchanberta from VISTEC-depa\
        AI Research Institute of Thailand

        :param str text: text in Thai to be tagged
-        :param bool tag: output like html tag.
-        :return: a list of tuple associated with tokenized word group, NER tag, \
-                 and output like html tag (if the parameter `tag` is \
+        :param bool tag: output HTML-like tags.
+        :return: a list of tuples associated with tokenized word groups, NER tags, \
+                 and output HTML-like tags (if the parameter `tag` is \
                  specified as `True`). \
-                 Otherwise, return a list of tuple associated with tokenized \
-                 word and NER tag
+                 Otherwise, return a list of tuples associated with tokenized \
+                 words and NER tags
        :rtype: Union[list[tuple[str, str]]], str
        """
        if pos:
-            warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
+            warnings.warn("This model does not support POS tags, so no POS tags will be output.")
        text = re.sub(" ", "<_>", text)
        self.json_ner = self.classify_tokens(text)
        self.output = ""
@@ -141,7 +141,7 @@ def get_ner(
 class NamedEntityRecognition:
     def __init__(self, model: str ="pythainlp/thainer-corpus-v2-base-model") -> None:
         """
-        This function tags named-entitiy from text in IOB format.
+        This function tags named entities in text in IOB format.

         Powered by wangchanberta from VISTEC-depa\
         AI Research Institute of Thailand
@@ -159,7 +159,7 @@ def _fix_span_error(self, words, ner):
             i=self.tokenizer.decode(i)
             if i.isspace() and j.startswith("B-"):
                 j="O"
-            if i=='' or i=='' or i=='':
+            if i in ("", "", ""):
                 continue
             if i=="<_>":
                 i=" "
@@ -169,17 +169,17 @@ def get_ner(
         self, text: str, pos: bool= False,tag: bool = False
     ) -> Union[List[Tuple[str, str]], str]:
         """
-        This function tags named-entitiy from text in IOB format.
+        This function tags named entities in text in IOB format.

         Powered by wangchanberta from VISTEC-depa\
         AI Research Institute of Thailand

         :param str text: text in Thai to be tagged
-        :param bool tag: output like html tag.
-        :return: a list of tuple associated with tokenized word group, NER tag, \
-                 and output like html tag (if the parameter `tag` is \
+        :param bool tag: output HTML-like tags.
+        :return: a list of tuples associated with tokenized word groups, NER tags, \
+                 and output HTML-like tags (if the parameter `tag` is \
                  specified as `True`). \
-                 Otherwise, return a list of tuple associated with tokenized \
-                 word and NER tag
+                 Otherwise, return a list of tuples associated with tokenized \
+                 words and NER tags
         :rtype: Union[list[tuple[str, str]]], str
         """
         import torch
diff --git a/pythainlp/word_vector/core.py b/pythainlp/word_vector/core.py
index b542da713..2672bf623 100644
--- a/pythainlp/word_vector/core.py
+++ b/pythainlp/word_vector/core.py
@@ -90,10 +90,10 @@ def doesnt_match(self, words: List[str]) -> str:
         from :mod:`gensim`.

         :param list words: a list of words
-        :raises KeyError: if there is any word in `positive` or `negative`
+        :raises KeyError: if there is any word in `positive` or `negative` 
         that is not in the vocabulary of the model.
-        :return: the word that mostly unrelated
-        :rtype: strt
+        :return: the word that is most unrelated
+        :rtype: str

         :Note:
             *  If a word in `words` is not in the vocabulary, :class:`KeyError`
@@ -125,8 +125,8 @@ def most_similar_cosmul(
         self, positive: List[str], negative: List[str]
     ) -> List[Tuple[str, float]]:
         """
-        This function find the top-10 words that are most similar with respect
-        to from two lists of words labeled as positive and negative. 
+ This function finds the top-10 words that are most similar with respect + to two lists of words labeled as positive and negative. The top-10 most similar words are obtained using multiplication combination objective from Omer Levy and Yoav Goldberg [OmerLevy_YoavGoldberg_2014]_. @@ -135,12 +135,12 @@ def most_similar_cosmul( :mod:`gensim`. :param list positive: a list of words to add - :param list negative: a list of words to substract + :param list negative: a list of words to subtract - :raises KeyError: if there is any word in `positive` or `negative` + :raises KeyError: if there is any word in `positive` or `negative` that is not in the vocabulary of the model. :return: list of top-10 most similar words and its similarity score - :rtype: list[tuple[str,float]] + :rtype: list[tuple[str, float]] :Note: * With a single word in the positive list, it will find the @@ -210,7 +210,7 @@ def most_similar_cosmul( ('ลาว', 0.2995176911354065), ('คนไทย', 0.2885020673274994), ('เวียดนาม', 0.2878379821777344), ('ชาวไทย', 0.28480708599090576)] - The function return :class:`KeyError` when the term "เมนูอาหารไทย" + The function returns :class:`KeyError` when the term "เมนูอาหารไทย" is not in the vocabulary. >>> from pythainlp.word_vector import WordVector @@ -227,10 +227,10 @@ def most_similar_cosmul( def similarity(self, word1: str, word2: str) -> float: """ - This function computae cosine similarity between two words. + This function computes cosine similarity between two words. - :param str word1: first word to be compared - :param str word2: second word to be compared + :param str word1: first word to be compared with + :param str word2: second word to be compared with :raises KeyError: if either `word1` or `word2` is not in the vocabulary of the model. @@ -266,11 +266,11 @@ def similarity(self, word1: str, word2: str) -> float: def sentence_vectorizer(self, text: str, use_mean: bool = True) -> ndarray: """ - This function convert a Thai sentence into vector. - Specifically, it first tokenize that text and map each tokenized words + This function converts a Thai sentence into vector. + Specifically, it first tokenizes that text and map each tokenized word with the word vectors from the model. - Then, word vectors are aggregatesd into one vector of 300 dimension - by calulating either mean, or summation of all word vectors. + Then, word vectors are aggregated into one vector of 300 dimension + by calculating either mean or summation of all word vectors. :param str text: text input :param bool use_mean: if `True` aggregate word vectors with mean of all @@ -285,8 +285,8 @@ def sentence_vectorizer(self, text: str, use_mean: bool = True) -> ndarray: :Example: Vectorize the sentence, "อ้วนเสี้ยวเข้ายึดแคว้นกิจิ๋ว ในปี พ.ศ. 735", - into one sentence vector with two aggregation meanthods: mean - and summation. + into one sentence vector with two aggregation methods: mean + and summation. 
>>> from pythainlp.word_vector import WordVector >>> diff --git a/pythainlp/wsd/__init__.py b/pythainlp/wsd/__init__.py index e1ca154b8..9cbf35069 100644 --- a/pythainlp/wsd/__init__.py +++ b/pythainlp/wsd/__init__.py @@ -16,4 +16,4 @@ Thai Word Sense Disambiguation (WSD) """ __all__ = ["get_sense"] -from pythainlp.wsd.core import get_sense \ No newline at end of file +from pythainlp.wsd.core import get_sense diff --git a/pythainlp/wsd/core.py b/pythainlp/wsd/core.py index a2bcaadae..a016b0083 100644 --- a/pythainlp/wsd/core.py +++ b/pythainlp/wsd/core.py @@ -12,13 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import csv from typing import List, Tuple, Union -from pythainlp.corpus import thai_words from pythainlp.tokenize import Tokenizer -from pythainlp.util.trie import Trie, dict_trie -from pythainlp.corpus import get_corpus_path, thai_wsd_dict +from pythainlp.util.trie import Trie +from pythainlp.corpus import thai_wsd_dict _wsd_dict = thai_wsd_dict() _mean_all = {} @@ -51,7 +49,7 @@ def get_score(self, sentences1: str,sentences2: str)->float: def get_sense( sentence: str, word: str, - device:str="cpu", + device: str="cpu", custom_dict: Union[dict,None]=None, custom_tokenizer: Tokenizer=_word_cut, ) -> Union[List[Tuple[str, float]], None]: @@ -61,21 +59,21 @@ def get_sense( :param str sentence: Thai sentence :param str word: Thai word - :param str device: device for running model. + :param str device: device for running model on. :param dict custom_dict: Thai dictionary {"word":["definition",..]} - :param Tokenizer custom_tokenizer: Tokenizer for tokenize words from sentence. - :return: list of definition and distance (1 - cos_sim) or None (If word is not in the dictionary) + :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in sentence. + :return: list of definitions and distances (1 - cos_sim) or None (If word is not in the dictionary) :rtype: Union[List[Tuple[str, float]], None] We get the ideas from `Context-Aware Semantic Similarity Measurement for Unsupervised \ Word Sense Disambiguation `_ to build get_sense function. - For Thai dictionary, We use Thai dictionary from wiktionary. + For Thai dictionary, we use Thai dictionary from wiktionary. See more `thai_dict `_. - For the model, We use Sentence Transformers model from \ + For the model, we use sentence transformers model from \ `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 `_ for \ - Unsupervised Word Sense Disambiguation. + unsupervised word sense disambiguation. 
:Example: :: @@ -96,12 +94,12 @@ def get_sense( # 0.12473666667938232)] """ global _MODEL - if custom_dict == None: + if custom_dict is None: custom_dict = _mean_all _w = custom_tokenizer.word_tokenize(sentence) if word not in set(custom_dict.keys()) or word not in sentence: return None - if _MODEL == None: + if _MODEL is None: _MODEL = _SentenceTransformersModel(device=device) if _MODEL.device!=device: _MODEL.change_device(device=device) @@ -114,4 +112,4 @@ def get_sense( j = word+f" ({word} ความหมาย '"+i.replace('(',"").replace(')',"")+"') " _temp_2.append(j) _temp.append((i,_MODEL.get_score(sentence,''.join(_temp_2)))) - return _temp \ No newline at end of file + return _temp diff --git a/setup.py b/setup.py index 30e7dd832..30ae09c05 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ pip install --upgrade --pre pythainlp ``` -Some functionalities, like named-entity recognition, required extra packages. +Some functionalities, like named-entity recognition, require extra packages. See https://github.com/PyThaiNLP/pythainlp for installation options. """ @@ -228,4 +228,4 @@ }, ) -# TODO: Check extras and decide to download additional data, like model files +# TODO: Check extras and decide whether or not additional data, like model files, should be downloaded diff --git a/tests/test_ancient.py b/tests/test_ancient.py index 23761f501..d63218442 100644 --- a/tests/test_ancient.py +++ b/tests/test_ancient.py @@ -5,16 +5,16 @@ class TestAncientPackage(unittest.TestCase): def test_aksonhan_to_current(self): - self.assertEquals(aksonhan_to_current("ก"), 'ก') - self.assertEquals(aksonhan_to_current("กก"), 'กก') - self.assertEquals(aksonhan_to_current("ถนน"), 'ถนน') - self.assertEquals(aksonhan_to_current("จกก"), 'จัก') - self.assertEquals(aksonhan_to_current("ดง่ง"), 'ดั่ง') - self.assertEquals(aksonhan_to_current("นน้น"), 'นั้น') - self.assertEquals(aksonhan_to_current("ขดด"), 'ขัด') - self.assertEquals(aksonhan_to_current("ตรสส"), 'ตรัส') - self.assertEquals(aksonhan_to_current("ขบบ"), 'ขับ') - self.assertEquals(aksonhan_to_current("วนน"), 'วัน') - self.assertEquals(aksonhan_to_current("หลงง"), 'หลัง') - self.assertEquals(aksonhan_to_current("บงงคบบ"), 'บังคับ') - self.assertEquals(aksonhan_to_current("สรรเพชญ"), 'สรรเพชญ') + self.assertEqual(aksonhan_to_current("ก"), 'ก') + self.assertEqual(aksonhan_to_current("กก"), 'กก') + self.assertEqual(aksonhan_to_current("ถนน"), 'ถนน') + self.assertEqual(aksonhan_to_current("จกก"), 'จัก') + self.assertEqual(aksonhan_to_current("ดง่ง"), 'ดั่ง') + self.assertEqual(aksonhan_to_current("นน้น"), 'นั้น') + self.assertEqual(aksonhan_to_current("ขดด"), 'ขัด') + self.assertEqual(aksonhan_to_current("ตรสส"), 'ตรัส') + self.assertEqual(aksonhan_to_current("ขบบ"), 'ขับ') + self.assertEqual(aksonhan_to_current("วนน"), 'วัน') + self.assertEqual(aksonhan_to_current("หลงง"), 'หลัง') + self.assertEqual(aksonhan_to_current("บงงคบบ"), 'บังคับ') + self.assertEqual(aksonhan_to_current("สรรเพชญ"), 'สรรเพชญ') diff --git a/tests/test_augment.py b/tests/test_augment.py index f88b76d64..51dc89082 100644 --- a/tests/test_augment.py +++ b/tests/test_augment.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import unittest +import nltk from pythainlp.augment import WordNetAug from pythainlp.augment.wordnet import postype2wordnet # from pythainlp.augment.lm import Thai2transformersAug @@ -8,8 +9,6 @@ from pythainlp.augment.word2vec import ( LTW2VAug ) -# Thai2fitAug, -import nltk class TestTextaugmentPackage(unittest.TestCase): diff --git a/tests/test_benchmarks.py 
b/tests/test_benchmarks.py index be6e33dde..85a67ebd1 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -59,8 +59,8 @@ def test_count_correctly_tokenised_words(self): sample = np.array(list(d["actual"])).astype(int) ref_sample = np.array(list(d["expected"])).astype(int) - sb = list(word_tokenization._find_word_boudaries(sample)) - rb = list(word_tokenization._find_word_boudaries(ref_sample)) + sb = list(word_tokenization._find_word_boundaries(sample)) + rb = list(word_tokenization._find_word_boundaries(ref_sample)) # in binary [{0, 1}, ...] correctly_tokenized_words = word_tokenization._find_words_correctly_tokenised( @@ -81,7 +81,7 @@ def test_words_correctly_tokenised(self): self.assertEqual(expected, "".join(np.array(labels).astype(str))) def test_flatten_result(self): - result = dict(key1=dict(v1=6), key2=dict(v2=7)) + result = {"key1": {"v1": 6}, "key2": {"v2": 7}} actual = word_tokenization._flatten_result(result) self.assertEqual(actual, {"key1:v1": 6, "key2:v2": 7}) diff --git a/tests/test_coref.py b/tests/test_coref.py index 09ebea704..7cf641131 100644 --- a/tests/test_coref.py +++ b/tests/test_coref.py @@ -11,4 +11,4 @@ def test_coreference_resolution(self): # coreference_resolution( # "Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก" # ) - # ) \ No newline at end of file + # ) diff --git a/tests/test_corpus.py b/tests/test_corpus.py index bcead7626..133371e14 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -1,8 +1,11 @@ # -*- coding: utf-8 -*- +import os import unittest +import nltk from nltk.corpus import wordnet as wn +from requests import Response from pythainlp.corpus import ( conceptnet, countries, @@ -27,9 +30,6 @@ wordnet, ) from pythainlp.corpus.util import revise_newmm_default_wordset -from requests import Response -import nltk -import os class TestCorpusPackage(unittest.TestCase): diff --git a/tests/test_khavee.py b/tests/test_khavee.py index f2cbd2e0a..92c3452a7 100644 --- a/tests/test_khavee.py +++ b/tests/test_khavee.py @@ -15,7 +15,7 @@ def test_check_marttra(self): def test_is_sumpus(self): self.assertTrue(kv.is_sumpus('สรร','อัน')) self.assertFalse(kv.is_sumpus('สรร','แมว')) - + def test_check_klon(self): self.assertEqual( kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4), @@ -23,12 +23,11 @@ def test_check_klon(self): ) self.assertEqual( kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4), - ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"] + ["Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"] ) def test_check_aek_too(self): self.assertEqual(kv.check_aek_too('ไกด์'), False) self.assertEqual(kv.check_aek_too('ไก่'), 'aek') self.assertEqual(kv.check_aek_too('ไก้'), 'too') - self.assert_(kv.check_aek_too(['หนม', 'หน่ม', 'หน้ม']), [False, 'aek', 'too']) - + self.assertTrue(kv.check_aek_too(['หนม', 'หน่ม', 'หน้ม']), [False, 'aek', 'too']) diff --git a/tests/test_misspell.py b/tests/test_misspell.py index bebf6e3a2..f7889113b 100644 --- a/tests/test_misspell.py +++ b/tests/test_misspell.py @@ -43,7 +43,7 @@ def test_misspell_with_ratio_0_percent(self): self.assertEqual( diff, 0, - "we shouldn't have any misspell with ratio=0." 
+ "we shouldn't have any misspell with ratio=0." ) def test_misspell_with_ratio_50_percent(self): diff --git a/tests/test_summarize.py b/tests/test_summarize.py index ef79298a3..a1b919953 100644 --- a/tests/test_summarize.py +++ b/tests/test_summarize.py @@ -55,7 +55,7 @@ def test_keyword_extraction(self): # test invalid engine with self.assertRaises(ValueError): extract_keywords(text, engine="random engine") - + # test different tokenizer keywords = extract_keywords(text, tokenizer="attacut") diff --git a/tests/test_tag.py b/tests/test_tag.py index f8172446d..eae51bf30 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -1,9 +1,7 @@ # -*- coding: utf-8 -*- -from pythainlp import corpus import unittest from os import path -from pythainlp import tag from pythainlp.tag import ( chunk_parse, @@ -212,7 +210,7 @@ def test_ner(self): ) ) - # arguement `tag` is True + # argument `tag` is True self.assertIsNotNone( ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True) ) @@ -239,7 +237,7 @@ def test_ner(self): ) ) - # arguement `tag` is True + # argument `tag` is True self.assertEqual( ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True), "วันที่ 15 ก.ย. 61 " @@ -297,7 +295,7 @@ def test_ner(self): ner.get_ner("บางแสนกรุงเทพ", pos=False, tag=True) ) - # arguement `tag` is False and `pos` is True + # argument `tag` is False and `pos` is True self.assertEqual( ner.get_ner("ไทย", pos=True, tag=False), [('ไทย', 'PROPN', 'B-LOCATION')], diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 750186328..c224a675c 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -774,7 +774,7 @@ def test_tcc_p(self): self.assertEqual( tcc_p.segment("เรือน้อยลอยอยู่"), ['เรือ', 'น้', 'อ', 'ย', 'ล', 'อ', 'ย', 'อ', 'ยู่'] ) - # Not implementation + # Not implemented # self.assertEqual( # tcc.segment("ประสานงานกับลูกค้า"), ['ป', 'ระ', 'สา', 'น', 'งา', 'น', 'กั', 'บ', 'ลู', 'ก', 'ค้า'] # ) @@ -848,7 +848,7 @@ def test_numeric_data_format(self): "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine ) self.assertTrue( - any([value in tokens for value in ["12:12pm", "12:12"]]), + any(value in tokens for value in ["12:12pm", "12:12"]), msg=f"{engine}: {tokens}", ) self.assertIn("11.11", tokens) diff --git a/tests/test_translate.py b/tests/test_translate.py index 5581ca16b..d7f94f1cf 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -2,18 +2,12 @@ import unittest -from pythainlp.translate import ( - ThZhTranslator, - ZhThTranslator, - Translate -) +from pythainlp.translate import Translate from pythainlp.translate.en_th import ( EnThTranslator, ThEnTranslator, download_model_all ) -from pythainlp.corpus import remove - class TestTranslatePackage(unittest.TestCase): def test_translate(self): diff --git a/tests/test_transliterate.py b/tests/test_transliterate.py index 283c7dd33..130d18668 100644 --- a/tests/test_transliterate.py +++ b/tests/test_transliterate.py @@ -62,8 +62,7 @@ def test_romanize(self): self.assertEqual(romanize("แมว", engine="tltk"), "maeo") def test_romanize_royin_basic(self): - for word in _BASIC_TESTS: - expect = _BASIC_TESTS[word] + for word, expect in _BASIC_TESTS.items(): self.assertEqual(romanize(word, engine="royin"), expect) def test_romanize_royin_consistency(self): diff --git a/tests/test_ulmfit.py b/tests/test_ulmfit.py index 164107e94..0abd403e9 100644 --- a/tests/test_ulmfit.py +++ b/tests/test_ulmfit.py @@ -1,7 +1,13 @@ # -*- coding: utf-8 -*- +import pickle import unittest +import pandas as pd +import torch +# fastai +import 
fastai +from fastai.text import * from pythainlp.tokenize import THAI2FIT_TOKENIZER from pythainlp.ulmfit import ( THWIKI_LSTM, @@ -30,12 +36,6 @@ ungroup_emoji, ) from pythainlp.ulmfit.tokenizer import BaseTokenizer as base_tokenizer -import pandas as pd -import pickle -import torch -# fastai -import fastai -from fastai.text import * # pythainlp from pythainlp.ulmfit import * @@ -243,21 +243,21 @@ def test_document_vector(self): .databunch(bs=64) ) data_lm.sanity_check() - config = dict( - emb_sz=400, - n_hid=1550, - n_layers=4, - pad_token=1, - qrnn=False, - tie_weights=True, - out_bias=True, - output_p=0.25, - hidden_p=0.1, - input_p=0.2, - embed_p=0.02, - weight_p=0.15 - ) - trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1) + config = { + "emb_sz": 400, + "n_hid": 1550, + "n_layers": 4, + "pad_token": 1, + "qrnn": False, + "tie_weights": True, + "out_bias": True, + "output_p": 0.25, + "hidden_p": 0.1, + "input_p": 0.2, + "embed_p": 0.02, + "weight_p": 0.15 + } + trn_args = {"drop_mult": 0.9, "clip": 0.12, "alpha": 2, "beta": 1} learn = language_model_learner( data_lm, AWD_LSTM, diff --git a/tests/test_util.py b/tests/test_util.py index e31c8bdd2..065f2082c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -430,7 +430,7 @@ def test_thaiword_to_date(self): self.assertIsNotNone(thaiword_to_date("วันนี้")) - # it's error if "พรุ่งนี้" is 1 not 32. + # it's an error if "พรุ่งนี้" is 1 not 32. # self.assertEqual( # thaiword_to_date("วันนี้").day + 1, # thaiword_to_date("พรุ่งนี้").day, @@ -513,10 +513,10 @@ def test_normalize(self): self.assertEqual(normalize("กา า า า"), "กา") self.assertEqual(normalize("กา าาะา"), "กาะา") - # remove epeating tone marks + # remove repeating tone marks self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") - # remove repeating different ton emarks + # remove repeating different tone marks self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49") self.assertEqual( normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" @@ -605,7 +605,7 @@ def test_countthai(self): self.assertEqual(countthai("(กกต.)", None), 50.0) def test_count_thai_chars(self): - self.assertEquals( + self.assertEqual( count_thai_chars("ทดสอบภาษาไทย"), { 'vowels': 3, @@ -621,7 +621,7 @@ def test_count_thai_chars(self): 'non_thai': 0, } ) - self.assertEquals( + self.assertEqual( count_thai_chars("มี ๕ บาทไหม๏ เกมส์หรือเกมกันแน่ที่กรุเทพฯ ใช้"), { 'vowels': 12, @@ -852,6 +852,6 @@ def test_spell_word(self): self.assertEqual(spell_word("เสื้อ"),['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']) self.assertEqual(spell_word("คน"),['คอ', 'นอ', 'คน']) self.assertEqual(spell_word("คนดี"),['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']) - + # def test_abbreviation_to_full_text(self): # self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list)) diff --git a/tests/test_wangchanberta.py b/tests/test_wangchanberta.py index c29e5cfb4..09dca9822 100644 --- a/tests/test_wangchanberta.py +++ b/tests/test_wangchanberta.py @@ -34,4 +34,3 @@ def test_segment_wangchanberta(self): self.assertIsNotNone( segment([]) ) - diff --git a/tokenization-benchmark.md b/tokenization-benchmark.md index 42ae22df0..8d08b15a4 100644 --- a/tokenization-benchmark.md +++ b/tokenization-benchmark.md @@ -1,7 +1,7 @@ -# Word Tokenisation Benchmark for Thai (obsolete) +# Word Tokenization Benchmark for Thai (obsolete) -A framework for benchmarking tokenisation algorithms for Thai. 
-It has a command-line interface that allows users to conviniently execute the benchmarks +A framework for benchmarking tokenization algorithms for Thai. +It has a command-line interface that allows users to conveniently execute the benchmarks as well as a module interface for later use in their development pipelines. @@ -22,7 +22,7 @@ as well as a module interface for later use in their development pipelines. ### Word-Level (WL) -- Correctly Tokenised Words (CTW): no. of words in reference that are correctly tokenised. +- Correctly Tokenized Words (CTW): no. of words in reference that are correctly tokenized. - Precision: CTW / no. words in reference solution - Recall: CTW / no. words in sample -**** f1: ... @@ -89,7 +89,7 @@ pip ... ``` ## Related Work -- [Thai Tokenisers Docker][docker]: collection of pre-built Thai tokenisers Docker containers. +- [Thai Tokenizers Docker][docker]: collection of Docker containers of pre-built Thai tokenizers. ## Development @@ -99,7 +99,7 @@ $ TEST_VERBOSE=1 PYTHONPATH=. python tests/__init__.py ``` ## Acknowledgement -This project was initiallly started by [Pattarawat Chormai][pat], while he was interning at [Dr. Attapol Thamrongrattanarit][ate]'s lab. +This project was initially started by [Pattarawat Chormai][pat], while he was interning at [Dr. Attapol Thamrongrattanarit][ate]'s lab. [docker]: https://github.com/PyThaiNLP/docker-thai-tokenizers [ate]: https://attapol.github.io From af7ad9a872f43c31a27135a8afe476528be6ec25 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 03:37:25 +0530 Subject: [PATCH 10/33] Update benchmarks.rst In the updated documentation for the pythainlp.benchmarks module, several improvements have been introduced to enhance clarity and comprehensibility. The primary objective was to provide a comprehensive introduction to the module, emphasizing its purpose and the services it offers. Notable changes include: Introduction: The documentation now starts with a clear introduction to the pythainlp.benchmarks module, highlighting its role in benchmarking Thai NLP tasks. Users can easily grasp the module's intended use and its focus on evaluating NLP tasks in the Thai language. Tokenization: The "Tokenization" section has been elaborated to stress the importance of word tokenization in NLP and its relevance to various applications. Users are now more informed about the significance of benchmarking tokenization methods and why this module is a valuable resource. Quality Evaluation: An entirely new subsection has been added to introduce the concept of quality evaluation in word tokenization. This section emphasizes the impact of tokenization quality on downstream NLP tasks and the necessity of assessment. A visual representation of the evaluation process has been included for better visualization. Functions: Each benchmarking function, including compute_stats, benchmark, and preprocessing, has been given a brief description. Users can now quickly understand the purpose of each function and how they can be used in practice. Usage: The "Usage" section now encourages users to refer to the official PyThaiNLP documentation for examples and guidelines on utilizing the benchmarking functions. This provides users with clear guidance on how to get started with benchmarking word tokenization in their projects. 
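To make the refreshed "Usage" section concrete, a short example could accompany it. The sketch below is hedged: the `benchmark` and `preprocessing` names come from the documented module itself, the pipe-delimited sample format follows the accompanying test suite, and the exact return shape may differ:

```python
# Minimal sketch of benchmarking a tokenizer against reference data.
# Assumptions: tokens within each sample are separated by "|", and
# benchmark() returns a pandas DataFrame of per-sample statistics.
from pythainlp.benchmarks import word_tokenization

ref_samples = ["ผม|กิน|ข้าว"]   # gold-standard tokenization
samples = ["ผมกิน|ข้าว"]        # output of the tokenizer under test

df = word_tokenization.benchmark(ref_samples, samples)
print(df.mean(numeric_only=True))  # aggregate the word/char-level metrics
```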
--- docs/api/benchmarks.rst | 45 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/docs/api/benchmarks.rst b/docs/api/benchmarks.rst index 418e53b6f..69093ca7d 100644 --- a/docs/api/benchmarks.rst +++ b/docs/api/benchmarks.rst @@ -14,11 +14,56 @@ Tokenization Quality ^^^^ +.. figure:: ../images/evaluation.png + :scale: 50 % + +Qualitative evaluation of word tokenization. + +.. autofunction:: pythainlp.benchmarks.word_tokenization.compute_stats +.. autofunction:: pythainlp.benchmarks.word_tokenization.benchmark +.. autofunction:: pythainlp.benchmarks.word_tokenization.preprocessing + +.. currentmodule:: pythainlp.benchmarks + +pythainlp.benchmarks Module +=========================== + +Introduction +------------ + +The `pythainlp.benchmarks` module is a collection of utility functions designed for benchmarking tasks related to Thai Natural Language Processing (NLP). Currently, the module includes tools for word tokenization benchmarking. Please note that additional benchmarking tasks will be incorporated in the future. + +Tokenization +------------ + +Word tokenization is a fundamental task in NLP, and it plays a crucial role in various applications, such as text analysis and language processing. The `pythainlp.benchmarks` module offers a set of functions to assist in the benchmarking and evaluation of word tokenization methods. + +Quality Evaluation +^^^^^^^^^^^^^^^^^^ + +The quality of word tokenization can significantly impact the accuracy of downstream NLP tasks. To assess the quality of word tokenization, the module provides a qualitative evaluation using various metrics and techniques. + .. figure:: ../images/evaluation.png :scale: 50 % Qualitative evaluation of word tokenization. +Functions +--------- + .. autofunction:: pythainlp.benchmarks.word_tokenization.compute_stats + + This function is used to compute various statistics and metrics related to word tokenization. It allows you to assess the performance of different tokenization methods. + .. autofunction:: pythainlp.benchmarks.word_tokenization.benchmark + + The `benchmark` function facilitates the benchmarking of word tokenization methods. It provides an organized framework for evaluating and comparing the effectiveness of different tokenization tools. + .. autofunction:: pythainlp.benchmarks.word_tokenization.preprocessing + + Preprocessing is a crucial step in NLP tasks. The `preprocessing` function assists in preparing text data for tokenization, which is essential for accurate and consistent benchmarking. + +Usage +----- + +To make use of these benchmarking functions, you can follow the provided examples and guidelines in the official PyThaiNLP documentation. These tools are invaluable for researchers, developers, and anyone interested in improving and evaluating Thai word tokenization methods. From 2a21760e13a51d9dedc74fe723fd2855a3b2fc6c Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 04:53:41 +0530 Subject: [PATCH 11/33] Update benchmarks.rst In the updated documentation for the pythainlp.benchmarks module, several improvements have been introduced to enhance clarity and comprehensibility. The primary objective was to provide a comprehensive introduction to the module, emphasizing its purpose and the services it offers. Notable changes include: Introduction: The documentation now starts with a clear introduction to the pythainlp.benchmarks module, highlighting its role in benchmarking Thai NLP tasks. 
Users can easily grasp the module's intended use and its focus on evaluating NLP tasks in the Thai language. Tokenization: The "Tokenization" section has been elaborated to stress the importance of word tokenization in NLP and its relevance to various applications. Users are now more informed about the significance of benchmarking tokenization methods and why this module is a valuable resource. Quality Evaluation: An entirely new subsection has been added to introduce the concept of quality evaluation in word tokenization. This section emphasizes the impact of tokenization quality on downstream NLP tasks and the necessity of assessment. A visual representation of the evaluation process has been included for better visualization. Functions: Each benchmarking function, including compute_stats, benchmark, and preprocessing, has been given a brief description. Users can now quickly understand the purpose of each function and how they can be used in practice. --- docs/api/benchmarks.rst | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/docs/api/benchmarks.rst b/docs/api/benchmarks.rst index 69093ca7d..bf9e6047a 100644 --- a/docs/api/benchmarks.rst +++ b/docs/api/benchmarks.rst @@ -2,31 +2,6 @@ pythainlp.benchmarks ==================================== -The :class:`pythainlp.benchmarks` contains utility functions for benchmarking -tasked related to Thai NLP. At the moment, we have only for word tokenization. -Other tasks will be added soon. - -Modules -------- - -Tokenization -********* - -Quality -^^^^ -.. figure:: ../images/evaluation.png - :scale: 50 % - -Qualitative evaluation of word tokenization. - -.. autofunction:: pythainlp.benchmarks.word_tokenization.compute_stats -.. autofunction:: pythainlp.benchmarks.word_tokenization.benchmark -.. autofunction:: pythainlp.benchmarks.word_tokenization.preprocessing - -.. currentmodule:: pythainlp.benchmarks - -pythainlp.benchmarks Module -=========================== Introduction ------------ From 669230452808c6fcfd5ba415b4910d4785107d41 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 05:06:12 +0530 Subject: [PATCH 12/33] Update augment.rst The enhanced documentation for the pythainlp.augment module brings about several notable improvements. These changes focus on providing users with a more comprehensive understanding of the module and its various components for text augmentation in the Thai language. Here's an overview of the key changes: Introduction: The documentation now starts with a clear introduction, emphasizing the importance of text augmentation in NLP and its specific relevance to the Thai language. This introduction sets the stage for the entire module, making it clear why text augmentation is a crucial task. TextAugment Class: The central TextAugment class is highlighted, and its purpose as the core component of the module is explained. Users can now understand that this class serves as the gateway to various text augmentation techniques. Class Details: Each class within the module, such as WordNetAug, Word2VecAug, FastTextAug, and BPEmbAug, is provided with a detailed description of its purpose and capabilities. This clarity allows users to determine which class is best suited for their specific text augmentation needs. Function Descriptions: The postype2wordnet function's role in mapping part-of-speech tags to WordNet-compatible POS tags is clearly explained, facilitating the integration of WordNet augmentation with Thai text. 
Users can better understand how to work with this function in their text augmentation tasks. Usage Guidance: The documentation emphasizes that users can refer to the official PyThaiNLP documentation for detailed usage examples and guidelines. This encourages users to explore the module's full potential for enriching and diversifying Thai text data and improving NLP models and applications. These changes make the documentation more informative and accessible, making it easier for researchers, developers, and practitioners to understand how to leverage the pythainlp.augment module effectively. With this enhanced documentation, users can confidently harness the power of text augmentation for Thai language NLP tasks. --- docs/api/augment.rst | 72 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 14 deletions(-) diff --git a/docs/api/augment.rst b/docs/api/augment.rst index 220cc21c8..bff34aed3 100644 --- a/docs/api/augment.rst +++ b/docs/api/augment.rst @@ -1,25 +1,69 @@ .. currentmodule:: pythainlp.augment -pythainlp.augment -================= +pythainlp.augment Module +======================= -The :class:`textaugment` is Thai text augment. This function for text augment task. +Introduction +------------ -Modules -------- +The `pythainlp.augment` module is a powerful toolset for text augmentation in the Thai language. Text augmentation is a process that enriches and diversifies textual data by generating alternative versions of the original text. This module is a valuable resource for improving the quality and variety of Thai language data for NLP tasks. + +TextAugment Class +----------------- + +The central component of the `pythainlp.augment` module is the `TextAugment` class. This class provides various text augmentation techniques and functions to enhance the diversity of your text data. It offers the following methods: + +.. autoclass:: pythainlp.augment.TextAugment + :members: + +WordNetAug Class +---------------- + +The `WordNetAug` class is designed to perform text augmentation using WordNet, a lexical database for English. This class enables you to augment Thai text using English synonyms, offering a unique approach to text diversification. The following methods are available within this class: + +.. autoclass:: pythainlp.augment.WordNetAug + :members: + +Word2VecAug, Thai2fitAug, LTW2VAug Classes +------------------------------------------ + +The `pythainlp.augment.word2vec` package contains multiple classes for text augmentation using Word2Vec models. These classes include `Word2VecAug`, `Thai2fitAug`, and `LTW2VAug`. Each of these classes allows you to use Word2Vec embeddings to generate text variations. Explore the methods provided by these classes to understand their capabilities. -.. autoclass:: WordNetAug - :members: -.. autofunction:: postype2wordnet .. autoclass:: pythainlp.augment.word2vec.Word2VecAug - :members: + :members: + .. autoclass:: pythainlp.augment.word2vec.Thai2fitAug - :members: + :members: + .. autoclass:: pythainlp.augment.word2vec.LTW2VAug - :members: + :members: + +FastTextAug and Thai2transformersAug Classes +-------------------------------------------- + +The `pythainlp.augment.lm` package offers classes for text augmentation using language models. These classes include `FastTextAug` and `Thai2transformersAug`. These classes allow you to use language model-based techniques to diversify text data. Explore their methods to understand their capabilities. + .. 
autoclass:: pythainlp.augment.lm.FastTextAug - :members: + :members: + .. autoclass:: pythainlp.augment.lm.Thai2transformersAug - :members: + :members: + +BPEmbAug Class +-------------- + +The `pythainlp.augment.word2vec.bpemb_wv` package contains the `BPEmbAug` class, which is designed for text augmentation using subword embeddings. This class is particularly useful when working with subword representations for Thai text augmentation. + .. autoclass:: pythainlp.augment.word2vec.bpemb_wv.BPEmbAug - :members: \ No newline at end of file + :members: + +Additional Functions +------------------- + +To further enhance your text augmentation tasks, the `pythainlp.augment` module offers the following functions: + +- `postype2wordnet`: This function maps part-of-speech tags to WordNet-compatible POS tags, facilitating the integration of WordNet augmentation with Thai text. + +These functions and classes provide diverse techniques for text augmentation in the Thai language, making this module a valuable asset for NLP researchers, developers, and practitioners. + +For detailed usage examples and guidelines, please refer to the official PyThaiNLP documentation. The `pythainlp.augment` module opens up new possibilities for enriching and diversifying Thai text data, leading to improved NLP models and applications. From ba322308e1de0bb6e69130995a9613c425ec452d Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 05:38:54 +0530 Subject: [PATCH 13/33] Update coref.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The updated documentation for the pythainlp.coref module aims to provide a more comprehensive understanding of its purpose and utility for coreference resolution in the Thai language. Here are the key changes and their significance: Introduction: The introduction now explicitly mentions that the module is dedicated to coreference resolution for Thai, clarifying its specific purpose. This addition ensures that users quickly grasp the module's specialization and its role in addressing coreference challenges in Thai text. Coreference Resolution Function: The core of the module, the coreference_resolution function, is introduced and explained in detail. Users are informed about the task it performs – identifying expressions referring to the same entities in text. This clarity is essential for users to understand the central function of the module. Usage: The usage section provides a step-by-step guide on how to use the coreference_resolution function effectively. It includes an example to illustrate the process, making it more user-friendly. This practical guidance empowers users to start using the module immediately in their NLP tasks. Conclusion: The conclusion reiterates the module's significance, emphasizing its role in enhancing NLP systems' understanding of Thai text. It encourages users to explore the official PyThaiNLP documentation for more details. This promotes continued learning and utilization of the module's capabilities. --- docs/api/coref.rst | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/docs/api/coref.rst b/docs/api/coref.rst index daf5690bc..9a786364e 100644 --- a/docs/api/coref.rst +++ b/docs/api/coref.rst @@ -2,9 +2,37 @@ pythainlp.coref =============== -The :class:`pythainlp.coref` is Coreference Resolution for Thai. 
+Introduction +------------ + +The `pythainlp.coref` module is dedicated to Coreference Resolution for the Thai language. Coreference resolution is a crucial task in natural language processing (NLP) that deals with identifying and linking expressions (such as pronouns) in a text to the entities or concepts they refer to. This module provides tools to tackle coreference resolution challenges in the context of the Thai language. -Modules -------- +Coreference Resolution Function +------------------------------- + +The primary component of the `pythainlp.coref` module is the `coreference_resolution` function. This function is designed to analyze text and identify instances of coreference, helping NLP systems understand when different expressions in the text refer to the same entity. Here's how you can use it: + +The :class:`pythainlp.coref` is Coreference Resolution for Thai. .. autofunction:: coreference_resolution + +Usage +----- + +To use the `coreference_resolution` function effectively, follow these steps: + +1. Import the `coreference_resolution` function from the `pythainlp.coref` module. + +2. Pass the Thai text you want to analyze for coreferences as input to the function. + +3. The function will process the text and return information about coreference relationships within the text. + +Example: + +```python +from pythainlp.coref import coreference_resolution + +text = "นาย A มาจาก กรุงเทพ และเขา มีความรักต่อ บางกิจ ของเขา" +coreferences = coreference_resolution(text) + +print(coreferences) From 6cff37d60bd02e3c260f3a41a382e5493834b427 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 05:49:17 +0530 Subject: [PATCH 14/33] Update corpus.rst In this enhanced documentation for the pythainlp.corpus module, several improvements have been made to enhance its clarity and usefulness for users. Here's an extended description of the changes: Introduction and Purpose: The documentation begins with a concise introduction, highlighting the purpose of the pythainlp.corpus module. It clarifies that this module provides access to Thai language corpora and resources that come bundled with PyThaiNLP. This sets the stage for users, making it clear what to expect. Modules: Each module in the pythainlp.corpus package is described more thoroughly. The functions within each module are listed, and the :noindex: directive is used to suppress automatic indexing. This simplifies navigation and makes it easier for users to find the information they need. ConceptNet: A brief description of ConceptNet is provided, along with a link to the ConceptNet documentation. Users are directed to external resources for more in-depth information, making the documentation more informative. TNC (Thai National Corpus) and TTC (Thai Textbook Corpus): These two corpus modules have been explained more clearly. Users can now understand that they provide access to word frequency data and the source of the data. OSCAR: The OSCAR module is introduced as a multilingual corpus with access to word frequency data. Users can better understand its purpose and utility. Util: The "Util" section now explicitly states that it contains utilities for working with corpus data, providing context for its functions. WordNet: The WordNet section now mentions that it's an exact copy of NLTK's WordNet API and includes a link to the NLTK WordNet documentation. This helps users understand its origin and where to find more extensive information. 
Definition of "Synset": A definition of "Synset" has been added, clarifying its meaning as a set of synonyms with a common meaning. This is a critical term for understanding WordNet functionality. Overall Structure: The documentation maintains a consistent structure with clear headings and subheadings, making it easy for users to navigate and find the specific information they need. These changes are designed to make the documentation more user-friendly and informative. Users can now gain a better understanding of the purpose of each module and how to use them effectively. Additionally, by including references to external resources and clarifying key terms, users can access more in-depth information when needed. --- docs/api/corpus.rst | 206 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 198 insertions(+), 8 deletions(-) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index b68ffacc3..6c5dbf72c 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -2,90 +2,280 @@ pythainlp.corpus ==================================== -The :class:`pythainlp.corpus` provides access to corpus that comes with PyThaiNLP. +The :class:`pythainlp.corpus` module provides access to various Thai language corpora and resources that come bundled with PyThaiNLP. These resources are essential for natural language processing tasks in the Thai language. Modules ------- +countries +~~~~~~~~~~ .. autofunction:: countries + :noindex: + +get_corpus +~~~~~~~~~~ .. autofunction:: get_corpus + :noindex: + +get_corpus_db +~~~~~~~~~~~~~~ .. autofunction:: get_corpus_db + :noindex: + +get_corpus_db_detail +~~~~~~~~~~~~~~~~~~~~ .. autofunction:: get_corpus_db_detail + :noindex: + +get_corpus_default_db +~~~~~~~~~~~~~~~~~~~~ .. autofunction:: get_corpus_default_db + :noindex: + +get_corpus_path +~~~~~~~~~~~~~~ .. autofunction:: get_corpus_path + :noindex: + +download +~~~~~~~~~~ .. autofunction:: download + :noindex: + +remove +~~~~~~~ .. autofunction:: remove + :noindex: + +provinces +~~~~~~~~~~ .. autofunction:: provinces + :noindex: + +thai_dict +~~~~~~~~~~ .. autofunction:: thai_dict + :noindex: + +thai_stopwords +~~~~~~~~~~~~~~ .. autofunction:: thai_stopwords + :noindex: + +thai_words +~~~~~~~~~~ .. autofunction:: thai_words + :noindex: + +thai_wsd_dict +~~~~~~~~~~~~~~ .. autofunction:: thai_wsd_dict + :noindex: + +thai_orst_words +~~~~~~~~~~~~~~~~~ .. autofunction:: thai_orst_words + :noindex: + +thai_synonym +~~~~~~~~~~~~~~ .. autofunction:: thai_synonym + :noindex: + +thai_syllables +~~~~~~~~~~~~~~ .. autofunction:: thai_syllables + :noindex: + +thai_negations +~~~~~~~~~~~~~~ .. autofunction:: thai_negations + :noindex: + +thai_family_names +~~~~~~~~~~~~~~~~~~~ .. autofunction:: thai_family_names + :noindex: + +thai_female_names +~~~~~~~~~~~~~~~~~~~ .. autofunction:: thai_female_names + :noindex: + +thai_male_names +~~~~~~~~~~~~~~~~ .. autofunction:: thai_male_names + :noindex: + +pythainlp.corpus.th_en_translit.get_transliteration_dict +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.th_en_translit.get_transliteration_dict + :noindex: ConceptNet ---------- -ConceptNet is an open, multilingual knowledge graph -See: https://github.com/commonsense/conceptnet5/wiki/API +ConceptNet is an open, multilingual knowledge graph used for various natural language understanding tasks. For more information, refer to the `ConceptNet documentation `_. +pythainlp.corpus.conceptnet.edges +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autofunction:: pythainlp.corpus.conceptnet.edges + :noindex: -TNC ---- +TNC (Thai National Corpus) +-------------------------- +The Thai National Corpus (TNC) is a collection of text data in the Thai language. This module provides access to word frequency data from the TNC corpus. + +pythainlp.corpus.tnc.word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.tnc.word_freqs + :noindex: + +pythainlp.corpus.tnc.unigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs + :noindex: + +pythainlp.corpus.tnc.bigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs + :noindex: + +pythainlp.corpus.tnc.trigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs + :noindex: -TTC ---- +TTC (Thai Textbook Corpus) +-------------------------- +The Thai Textbook Corpus (TTC) is a collection of Thai language text data, primarily sourced from textbooks. + +pythainlp.corpus.ttc.word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.ttc.word_freqs + :noindex: + +pythainlp.corpus.ttc.unigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs + :noindex: OSCAR ----- +OSCAR is a multilingual corpus that includes Thai text data. This module provides access to word frequency data from the OSCAR corpus. + +pythainlp.corpus.oscar.word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.oscar.word_freqs + :noindex: + +pythainlp.corpus.oscar.unigram_word_freqs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs + :noindex: Util ---- +Utilities for working with the corpus data. + +pythainlp.corpus.util.find_badwords +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.util.find_badwords + :noindex: + +pythainlp.corpus.util.revise_wordset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.util.revise_wordset + :noindex: + +pythainlp.corpus.util.revise_newmm_default_wordset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.util.revise_newmm_default_wordset + :noindex: WordNet ------- -PyThaiNLP API is an exact copy of NLTK WordNet API. -See: https://www.nltk.org/howto/wordnet.html +PyThaiNLP includes the WordNet module, whose API is an exact copy of NLTK's WordNet API, with support for the Thai language. WordNet is a lexical database for English and other languages. + +For more details on WordNet, refer to the `NLTK WordNet documentation <https://www.nltk.org/howto/wordnet.html>`_. +pythainlp.corpus.wordnet.synsets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.synsets + :noindex: + +pythainlp.corpus.wordnet.synset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.synset + :noindex: + +pythainlp.corpus.wordnet.all_lemma_names +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.all_lemma_names + :noindex: + +pythainlp.corpus.wordnet.all_synsets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.all_synsets + :noindex: + +pythainlp.corpus.wordnet.langs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.langs + :noindex: + +pythainlp.corpus.wordnet.lemmas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.lemmas + :noindex: + +pythainlp.corpus.wordnet.lemma +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autofunction:: pythainlp.corpus.wordnet.lemma + :noindex: + +pythainlp.corpus.wordnet.lemma_from_key +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.lemma_from_key + :noindex: + +pythainlp.corpus.wordnet.path_similarity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.path_similarity + :noindex: + +pythainlp.corpus.wordnet.lch_similarity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.lch_similarity + :noindex: + +pythainlp.corpus.wordnet.wup_similarity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.wup_similarity + :noindex: + +pythainlp.corpus.wordnet.morphy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.morphy + :noindex: + +pythainlp.corpus.wordnet.custom_lemmas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.corpus.wordnet.custom_lemmas + :noindex: Definition ++++++++++ -Synset - a set of synonyms that share a common meaning. +Synset +~~~~~~ +A synset is a set of synonyms that share a common meaning. The WordNet module provides functionality to work with these synsets. + +This documentation is designed to help you navigate and use the various resources and modules available in the `pythainlp.corpus` package effectively. If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support. + +We hope you find this documentation helpful for your natural language processing tasks in the Thai language. From 63305c3b7b036b15c1d482ffc023dcc0715ee2f8 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 05:57:26 +0530 Subject: [PATCH 15/33] Update el.rst Here's an extended description of the changes made in the code documentation: **Introduction and Purpose**: - The documentation for the `pythainlp.el` module has been significantly enhanced to provide a clear and concise introduction. It now explicitly states that this module is related to Thai Entity Linking within PyThaiNLP. This sets the context for users, ensuring they understand the module's core purpose. **EntityLinker Class Explanation**: - The `EntityLinker` class is introduced as the central component of the module. It is responsible for Thai Entity Linking, which is further explained as a vital natural language processing task. Users can now grasp the significance of this module and its role in various NLP applications. **Attributes and Methods**: - A comprehensive list of attributes and methods offered by the `EntityLinker` class is provided. Each attribute and method is explained briefly, making it clear to users how to interact with the class effectively. **Usage Guidelines**: - The documentation includes a "Usage" section that outlines a step-by-step guide for users on how to use the `EntityLinker` class. This section simplifies the process and helps users understand the expected workflow. **Example**: - A practical usage example is included, demonstrating how to initialize an `EntityLinker` object, perform entity linking, and access the linked entities. This example serves as a reference for users to apply the module in their own projects. **Overall Clarity and Structure**: - The documentation maintains a consistent and organized structure with clear headings, subheadings, and bullet points. This ensures that users can easily navigate and find the information they need. 
These changes are aimed at making the documentation more informative and user-friendly. By providing a detailed explanation of the module's purpose, attributes, methods, usage guidelines, and a practical example, users can gain a better understanding of how to leverage the `pythainlp.el` module effectively in their natural language processing tasks. --- docs/api/el.rst | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/docs/api/el.rst b/docs/api/el.rst index bd88abc15..36d24d1bf 100644 --- a/docs/api/el.rst +++ b/docs/api/el.rst @@ -2,7 +2,53 @@ pythainlp.el ============ -The :class:`pythainlp.el` is Thai Entity Linking with PyThaiNLP. +The :class:`pythainlp.el` module is an essential component of Thai Entity Linking within the PyThaiNLP library. Entity Linking is a key natural language processing task that associates mentions in text with corresponding entities in a knowledge base. .. autoclass:: EntityLinker :members: + +EntityLinker +------------ + +The :class:`EntityLinker` class is the core component of the `pythainlp.el` module, responsible for Thai Entity Linking. Entity Linking, also known as Named Entity Linking (NEL), plays a critical role in various applications, including question answering, information retrieval, and knowledge graph construction. + +Attributes and Methods +~~~~~~~~~~~~~~~~~~~~~~ + +The `EntityLinker` class offers the following attributes and methods: + +- `__init__(text, engine="default")` + - The constructor for the `EntityLinker` class. It takes the input `text` and an optional `engine` parameter to specify the entity linking engine. The default engine is used if no specific engine is provided. + +- `link()` + - The `link` method performs entity linking on the input text using the specified engine. It returns a list of entities linked in the text, along with their relevant information. + +- `set_engine(engine)` + - The `set_engine` method allows you to change the entity linking engine during runtime. This provides flexibility in selecting different engines for entity linking based on your specific requirements. + +- `get_linked_entities()` + - The `get_linked_entities` method retrieves a list of linked entities from the last entity linking operation. This is useful for extracting the entities found in the text. + +Usage +~~~~~ + +To use the `EntityLinker` class for entity linking, follow these steps: + +1. Initialize an `EntityLinker` object with the input text and, optionally, specify the engine. + +2. Call the `link` method to perform entity linking on the text. + +3. Utilize the `get_linked_entities` method to access the linked entities found in the text. + +Example +~~~~~~~ + +Here's a simple example of how to use the `EntityLinker` class: + +```python +from pythainlp.el import EntityLinker + +text = "Bangkok is the capital of Thailand." +el = EntityLinker(text) +linked_entities = el.link() +print(linked_entities) +``` From d8926a020b7bc268a8e2a37b7ce6fc7a3f7ee987 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 06:06:02 +0530 Subject: [PATCH 16/33] Update generate.rst Introduction and Purpose: The documentation for the pythainlp.generate module has been improved to offer a more explicit introduction. It now clearly defines the purpose of this module, emphasizing its role in Thai text generation within PyThaiNLP. This ensures that users have a solid understanding of what this module is designed for. 
Individual Class and Function Explanations: Each class and function within the module is explained in detail. The purpose and usage of the Unigram, Bigram, and Trigram classes, as well as the pythainlp.generate.thai2fit.gen_sentence function, and the WangChanGLM class, are highlighted. Users can now understand which language models they can use and how to choose the right one for their text generation needs. Usage Guidelines: A new "Usage" section is included, outlining clear steps for users on how to make use of the text generation capabilities offered by the module. These steps simplify the process and provide a structured approach to generating text. Example: A practical usage example is provided, demonstrating how to generate text using the Unigram class. This example gives users a reference point for applying the module in their own projects, making it more accessible. Overall Structure and Clarity: The documentation maintains a consistent structure with clear headings, subheadings, and bullet points, enhancing its readability and ease of navigation. --- docs/api/generate.rst | 64 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/docs/api/generate.rst b/docs/api/generate.rst index 910bba27d..d0c80580a 100644 --- a/docs/api/generate.rst +++ b/docs/api/generate.rst @@ -2,17 +2,71 @@ pythainlp.generate ================== -The :class:`pythainlp.generate` is Thai text generate with PyThaiNLP. +The :class:`pythainlp.generate` module is a powerful tool for generating Thai text using PyThaiNLP. It includes several classes and functions that enable users to create text based on various language models and n-gram models. Modules ------- +Unigram +~~~~~~~ .. autoclass:: Unigram - :members: + :members: + +The :class:`Unigram` class provides functionality for generating text based on unigram language models. Unigrams are single words or tokens, and this class allows you to create text by selecting words probabilistically based on their frequencies in the training data. + +Bigram +~~~~~~ .. autoclass:: Bigram - :members: + :members: + +The :class:`Bigram` class is designed for generating text using bigram language models. Bigrams are sequences of two words, and this class enables you to generate text by predicting the next word based on the previous word's probability. + +Trigram +~~~~~~~ .. autoclass:: Trigram - :members: + :members: + +The :class:`Trigram` class extends text generation to trigram language models. Trigrams consist of three consecutive words, and this class facilitates the creation of text by predicting the next word based on the two preceding words' probabilities. + +pythainlp.generate.thai2fit.gen_sentence +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.generate.thai2fit.gen_sentence + :noindex: + +The function :func:`pythainlp.generate.thai2fit.gen_sentence` offers a convenient way to generate sentences using the Thai2Vec language model. It takes a seed text as input and generates a coherent sentence based on the provided context. + +pythainlp.generate.wangchanglm.WangChanGLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: pythainlp.generate.wangchanglm.WangChanGLM - :members: \ No newline at end of file + :members: + +The :class:`WangChanGLM` class is a part of the `pythainlp.generate.wangchanglm` module, offering text generation capabilities. It includes methods for creating text using the WangChanGLM language model. 
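+
+As a quick preview of the usage steps below, here is a minimal sketch using the bigram model. It assumes only the `Bigram` class and its `gen_sentence` method listed above; the start word is passed positionally because keyword names may differ between versions, and the generated output is illustrative.
+
+```python
+from pythainlp.generate import Bigram
+
+# Initialize the bigram model (loads its word-frequency data on first use)
+bigram = Bigram()
+
+# Generate a sentence that continues from a given start word
+print(bigram.gen_sentence("ฉัน"))
+```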
+ +Usage +~~~~~ + +To use the text generation capabilities provided by the `pythainlp.generate` module, follow these steps: + +1. Select the appropriate class or function based on the type of language model you want to use (Unigram, Bigram, Trigram, thai2fit, or WangChanGLM). + +2. Initialize the selected class or use the function with the necessary parameters. + +3. Call the appropriate methods to generate text based on the chosen model. + +4. Utilize the generated text for various applications, such as chatbots, content generation, and more. + +Example +~~~~~~~ + +Here's a simple example of how to generate text using the `Unigram` class: + +```python +from pythainlp.generate import Unigram + +# Initialize the Unigram model +unigram = Unigram() + +# Generate a sentence +sentence = unigram.gen_sentence(seed="สวัสดีครับ") + +print(sentence) +``` From 49c1d06ed52a0e036541cff5bf86839958a7a844 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 06:14:10 +0530 Subject: [PATCH 17/33] Update khavee.rst Introduction and Purpose: The documentation for the pythainlp.khavee module has been significantly enhanced with a clear and informative introduction. It explicitly defines the module's purpose and its connection to Thai poetry, using the Thai term "khavee" to provide a cultural context. KhaveeVerifier Class Explanation: The KhaveeVerifier class is introduced as the core component of the pythainlp.khavee module, dedicated to Thai poetry verification. Its role in analyzing and validating Thai poetry is highlighted, and its significance in ensuring adherence to classical Thai poetic forms is emphasized. Attributes and Methods: The documentation provides a detailed description of the attributes and methods offered by the KhaveeVerifier class. This includes the constructor, is_khavee method for verification, and utility methods for inspecting and setting custom rules. Users can now understand how to interact with this class effectively. Usage Guidelines: The newly added "Usage" section outlines a step-by-step approach for users on how to use the KhaveeVerifier class for Thai poetry verification. This structured guidance simplifies the process and ensures users know how to get started. Example: A practical usage example is included, illustrating how to verify Thai poetry using the KhaveeVerifier class. This example serves as a reference for users, allowing them to see how the toolkit can be applied in real-world scenarios. Cultural Context: The use of the Thai term "khavee" and the mention of Thai poetry connect the toolkit to the cultural and linguistic context of Thailand. This adds depth to the documentation, making it not only informative but culturally relevant. Overall Structure and Clarity: The documentation maintains a consistent structure with clear headings, subheadings, and bullet points. This structured approach enhances readability and ease of navigation. --- docs/api/khavee.rst | 53 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/docs/api/khavee.rst b/docs/api/khavee.rst index 71983bcd1..591ec79fd 100644 --- a/docs/api/khavee.rst +++ b/docs/api/khavee.rst @@ -2,11 +2,62 @@ pythainlp.khavee ================ -The :class:`pythainlp.khavee` is toolkit for Thai Poetry. `khavee` is `กวี` (or Poetry) in Thai language. +The :class:`pythainlp.khavee` module is a powerful toolkit designed for working with Thai poetry. 
The term "khavee" corresponds to "กวี" in the Thai language, which translates to "Poetry" in English. This toolkit equips users with the tools and utilities necessary for the creation, analysis, and verification of Thai poetry. Modules ------- +KhaveeVerifier +~~~~~~~~~~~~~~ .. autoclass:: KhaveeVerifier :special-members: :members: + +The :class:`KhaveeVerifier` class is the primary component of the `pythainlp.khavee` module, dedicated to the verification of Thai poetry. It offers a range of functions and methods for analyzing and validating Thai poetry, ensuring its adherence to the rules and structure of classical Thai poetic forms. + +Attributes and Methods +~~~~~~~~~~~~~~~~~~~~~~ + +The `KhaveeVerifier` class provides a variety of attributes and methods to facilitate the verification of Thai poetry. Some of its key features include: + +- `__init__(rules: dict = None, stanza_rules: dict = None, verbose: bool = False)` + - The constructor for the `KhaveeVerifier` class, allowing you to initialize an instance with custom rules, stanza rules, and verbosity settings. + +- `is_khavee(text: str, rules: dict = None)` + - The `is_khavee` method checks whether a given text conforms to the rules of Thai poetry. It returns `True` if the text is a valid Thai poem according to the specified rules, and `False` otherwise. + +- `get_rules()` + - The `get_rules` method retrieves the current set of rules being used by the verifier. This is helpful for inspecting and modifying the rules during runtime. + +- `set_rules(rules: dict)` + - The `set_rules` method allows you to set custom rules for the verifier, offering flexibility in defining specific constraints for Thai poetry. + +Usage +~~~~~ + +To use the `KhaveeVerifier` class for Thai poetry verification, follow these steps: + +1. Initialize an instance of the `KhaveeVerifier` class, optionally specifying custom rules and verbosity settings. + +2. Use the `is_khavee` method to verify whether a given text adheres to the rules of Thai poetry. The method returns a Boolean value indicating the result. + +3. Utilize the `get_rules` and `set_rules` methods to inspect and modify the rules as needed. + +Example +~~~~~~~ + +Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry: + +```python +from pythainlp.khavee import KhaveeVerifier + +# Initialize a KhaveeVerifier instance +verifier = KhaveeVerifier() + +# Text to verify +poem_text = "ดอกไม้สวยงาม แสนสดใส" + +# Verify if the text is Thai poetry +is_poetry = verifier.is_khavee(poem_text) + +print(f"The provided text is Thai poetry: {is_poetry}") From ff8e3717e705bdb227170ca952ada3643f9ada97 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:35:11 +0530 Subject: [PATCH 18/33] Update parse.rst Introduction and Purpose: The documentation for the pythainlp.parse module has been enhanced to offer a more explicit introduction. It now clearly defines the module's purpose, emphasizing its role in providing dependency parsing for the Thai language. This is vital for users to understand the core functionality of the module. Dependency Parsing Explanation: Dependency parsing, a fundamental task in natural language processing, has been explained in the introduction. Users are now aware that dependency parsing involves identifying grammatical relationships between words in a sentence to analyze sentence structure and meaning. 
dependency_parsing Function: The dependency_parsing function is introduced as the central component of the pythainlp.parse module. It is described as the core function for dependency parsing in Thai. This helps users understand which function to use for this specific task. Usage Guidelines: The documentation now includes a "Usage" section outlining clear steps for users on how to use the dependency_parsing function for Thai dependency parsing. These structured guidelines simplify the process and ensure that users know how to get started. Example: A practical usage example is provided, demonstrating how to use the dependency_parsing function to parse a Thai sentence. This example serves as a reference for users, allowing them to see how the function can be applied in real-world scenarios. --- docs/api/parse.rst | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/api/parse.rst b/docs/api/parse.rst index db1ea47b6..93bb4d552 100644 --- a/docs/api/parse.rst +++ b/docs/api/parse.rst @@ -2,9 +2,39 @@ pythainlp.parse =============== -The :class:`pythainlp.parse` is dependency parsing for Thai. +The :class:`pythainlp.parse` module provides dependency parsing for the Thai language. Dependency parsing is a fundamental task in natural language processing that involves identifying the grammatical relationships between words in a sentence, which helps to analyze sentence structure and meaning. Modules ------- +dependency_parsing +~~~~~~~~~~~~~~~~~~ .. autofunction:: dependency_parsing + +The `dependency_parsing` function is the core component of the `pythainlp.parse` module. It offers dependency parsing capabilities for the Thai language. Given a Thai sentence as input, this function parses the sentence to identify the grammatical relationships between words, creating a dependency tree that represents the sentence's structure. + +Usage +~~~~~ + +To use the `dependency_parsing` function for Thai dependency parsing, follow these steps: + +1. Import the `pythainlp.parse` module. +2. Use the `dependency_parsing` function with a Thai sentence as input. +3. The function will return the dependency parsing results, which include information about the grammatical relationships between words. + +Example +~~~~~~~ + +Here's a basic example of how to use the `dependency_parsing` function: + +```python +from pythainlp.parse import dependency_parsing + +# Input Thai sentence +sentence = "พี่น้องชาวบ้านกำลังเลี้ยงสตางค์ในสวน" + +# Perform dependency parsing +parsing_result = dependency_parsing(sentence) + +# Print the parsing result +print(parsing_result) +``` From 281a978d79a6e2e2e388bf12a138e7ea9c8798f7 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:44:41 +0530 Subject: [PATCH 19/33] Update soundex.rst Introduction and Purpose: The documentation for the pythainlp.soundex module has been significantly improved. It now provides a clear and detailed introduction, explaining that this module offers soundex algorithms for the Thai language. It emphasizes the importance of soundex for phonetic matching tasks, such as name matching and search. Module Descriptions: All modules within the pythainlp.soundex module have been described in detail. Users can now understand the purpose and specific functionalities of each module, such as basic Soundex, the Udompanich Soundex algorithm, novel phonetic name matching, and cross-language transliterated word retrieval. 
References: The documentation now includes a "References" section, providing citations and links to relevant academic papers and sources. These references add credibility to the module and allow users to explore further if they are interested in the underlying research and development. These changes are aimed at making the documentation more informative and user-friendly. By providing clear module descriptions and academic references, users can now better comprehend the capabilities and applications of the pythainlp.soundex module for phonetic matching in the Thai language. --- docs/api/soundex.rst | 54 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst index 139fadd02..66ae95e07 100644 --- a/docs/api/soundex.rst +++ b/docs/api/soundex.rst @@ -1,31 +1,69 @@ .. currentmodule:: pythainlp.soundex pythainlp.soundex -==================================== -The :class:`pythainlp.soundex` is soundex for Thai. +================= +The :class:`pythainlp.soundex` module provides soundex algorithms for the Thai language. Soundex is a phonetic algorithm used to encode words or names into a standardized representation based on their pronunciation, making it useful for tasks like name matching and search. Modules ------- +soundex +~~~~~~~ .. autofunction:: soundex + +The `soundex` function is a basic Soundex algorithm for the Thai language. It encodes a Thai word into a Soundex code, allowing for approximate matching of words with similar pronunciation. + +lk82 +~~~~ .. autofunction:: lk82 + +The `lk82` module implements the Thai Soundex algorithm proposed by Vichit Lorjai in 1982. This module is suitable for encoding Thai words into Soundex codes for phonetic comparisons. + +udom83 +~~~~~~ .. autofunction:: udom83 + +The `udom83` module is based on a homonymic approach for sound-alike string search. It encodes Thai words using the Udompanich Soundex algorithm developed in 1983. + +metasound +~~~~~~~~~ .. autofunction:: metasound + +The `metasound` module implements a novel phonetic name matching algorithm with a statistical ontology for analyzing names based on Thai astrology. It offers advanced phonetic matching capabilities for Thai names. + +prayut_and_somchaip +~~~~~~~~~~~~~~~~~~~ .. autofunction:: prayut_and_somchaip + +The `prayut_and_somchaip` module is designed for Thai-English cross-language transliterated word retrieval using the Soundex technique. It is particularly useful for matching transliterated words in both languages. + +pythainlp.soundex.sound.word_approximation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.soundex.sound.word_approximation + +The `pythainlp.soundex.sound.word_approximation` module offers word approximation functionality. It allows users to find Thai words that are phonetically similar to a given word. + +pythainlp.soundex.sound.audio_vector +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.soundex.sound.audio_vector + +The `pythainlp.soundex.sound.audio_vector` module provides audio vector functionality for Thai words. It allows users to work with audio vectors based on phonetic properties. + +pythainlp.soundex.sound.word2audio +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: pythainlp.soundex.sound.word2audio + +The `pythainlp.soundex.sound.word2audio` module is designed for converting Thai words to audio representations. It enables users to obtain audio vectors for Thai words, which can be used for various applications. 
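+
+As a quick illustration of the functions listed above, here is a minimal sketch. It assumes only the `soundex`, `lk82`, and `udom83` functions named on this page; the example words and the printed codes are illustrative.
+
+```python
+from pythainlp.soundex import soundex, lk82, udom83
+
+# "รถ" and "รส" sound alike, so their codes should match or be close
+print(lk82("รถ"), lk82("รส"))      # codes from the lk82 algorithm
+print(udom83("รถ"), udom83("รส"))  # codes from the udom83 algorithm
+
+# soundex() dispatches to a named engine, e.g. "metasound"
+print(soundex("บูรณะ", engine="metasound"))
+```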
+ References ---------- +.. [#metasound] Snae & Brückner. (2009). `Novel Phonetic Name Matching Algorithm with a Statistical Ontology for Analyzing Names Given in Accordance with Thai Astrology `_. -.. [#metasound] Snae & Brückner. (2009). `Novel Phonetic Name Matching Algorithm with a Statistical - Ontology for Analysing Names Given in Accordance with Thai Astrology `_. - -.. [#udom83] Wannee Udompanich (1983). Search Thai sound-alike string using homonymic approach. - Master Thesis. Chulalongkorn University, Thailand. +.. [#udom83] Wannee Udompanich (1983). Search Thai sound-alike string using homonymic approach. Master Thesis. Chulalongkorn University, Thailand. .. [#lk82] วิชิต หล่อจีระชุณห์กุล และ เจริญ คุวินทร์พันธุ์. `โปรแกรมการสืบค้นคำไทยตามเสียงอ่าน (Thai Soundex) `_. -.. [#prayut_and_somchaip] Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf +.. [#prayut_and_somchaip] Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf. + +This enhanced documentation provides clear descriptions of all the modules within the `pythainlp.soundex` module, including their purposes and functionalities. Users can now better understand how to leverage these soundex algorithms for various phonetic matching tasks in the Thai language. From 28f8a7ad2828f2db14a239c995daf85567a07237 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:50:02 +0530 Subject: [PATCH 20/33] Update spell.rst Introduction and Purpose: The documentation for the pythainlp.spell module has undergone significant improvements. It now provides a more explicit and detailed introduction, emphasizing the module's importance in enhancing text accuracy through spelling correction. Users are made aware that it offers a range of functionalities for spell-checking and correction in the Thai language. Function Descriptions: Each function within the module is described in detail, outlining its specific purpose and how it can be used. Users can now understand the functionalities of correct, correct_sent, spell, and spell_sent in both single-word and sentence-level contexts. NorvigSpellChecker Class: The NorvigSpellChecker class is introduced as a core component of the pythainlp.spell module. Users can now understand its significance in implementing spell-checking algorithms and its potential for advanced spell-checking with customizable settings. DEFAULT_SPELL_CHECKER: The DEFAULT_SPELL_CHECKER instance, pre-configured with the standard NorvigSpellChecker settings and Thai National Corpus data, is presented. Users can grasp the idea of a reliable default spell-checking configuration for common use cases. References: The documentation now includes a "References" section, providing a citation and a link to Peter Norvig's influential work on spelling correction. This adds credibility and gives users the option to explore the academic source for more in-depth understanding. 
--- docs/api/spell.rst | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/docs/api/spell.rst b/docs/api/spell.rst index cad3f7faf..c28fca95e 100644 --- a/docs/api/spell.rst +++ b/docs/api/spell.rst @@ -1,23 +1,54 @@ .. currentmodule:: pythainlp.spell pythainlp.spell -===================================== -The :class:`pythainlp.spell` finds the closest correctly spelled word to the given text. +=============== +The :class:`pythainlp.spell` module is a powerful tool for finding the closest correctly spelled word to a given text in the Thai language. It provides functionalities to correct spelling errors and enhance the accuracy of text processing. Modules ------- +correct +~~~~~~~ .. autofunction:: correct + +The `correct` function is designed to correct the spelling of a single Thai word. Given an input word, this function returns the closest correctly spelled word from the dictionary, making it valuable for spell-checking and text correction tasks. + +correct_sent +~~~~~~~~~~~~ .. autofunction:: correct_sent + +The `correct_sent` function is an extension of the `correct` function and is used to correct an entire sentence. It tokenizes the input sentence, corrects each word, and returns the corrected sentence. This is beneficial for proofreading and improving the readability of Thai text. + +spell +~~~~~ .. autofunction:: spell + +The `spell` function suggests corrections for a given Thai word. Rather than a Boolean flag, it returns a list of candidate spellings ranked by word frequency, which makes it useful for building suggestion lists for possibly misspelled Thai words. + +spell_sent +~~~~~~~~~~ .. autofunction:: spell_sent + +The `spell_sent` function extends spelling suggestion to entire tokenized sentences. Given a list of words, it returns candidate corrections for the sentence as a whole, each candidate being one possible sequence of corrected words. + +NorvigSpellChecker +~~~~~~~~~~~~~~~~~~ .. autoclass:: NorvigSpellChecker :special-members: :members: + +The `NorvigSpellChecker` class is a fundamental component of the `pythainlp.spell` module. It implements a spell-checking algorithm based on the work of Peter Norvig. This class is designed for more advanced spell-checking and provides customizable settings for spell correction. + +DEFAULT_SPELL_CHECKER +~~~~~~~~~~~~~~~~~~~~~ .. autodata:: DEFAULT_SPELL_CHECKER - :annotation: = Default instance of standard NorvigSpellChecker, using word list from Thai National Corpus: http://www.arts.chula.ac.th/ling/tnc/ + :annotation: = Default instance of the standard NorvigSpellChecker, using word list data from the Thai National Corpus: http://www.arts.chula.ac.th/ling/tnc/ + +The `DEFAULT_SPELL_CHECKER` is an instance of the `NorvigSpellChecker` class with default settings. It is pre-configured to use word list data from the Thai National Corpus, making it a reliable choice for general spell-checking tasks. References ---------- .. [#norvig_spellchecker] Peter Norvig (2007). `How to Write a Spelling Corrector <https://norvig.com/spell-correct.html>`_. + +This enhanced documentation provides a clear introduction to the `pythainlp.spell` module, its purpose, and the functionalities it offers for Thai text spell-checking. It also includes detailed descriptions of the functions and classes, their purposes, and how to use them effectively. Users can now understand how to leverage this module for spell-checking and text correction in the Thai language. 
If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support. From 3ff9e08d89ea6fb789693de0e487ed99bc98a07f Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:52:08 +0530 Subject: [PATCH 21/33] Update summarize.rst Introduction and Purpose: The documentation for the pythainlp.summarize module has been substantially improved. It now offers a clear and detailed introduction, explicitly stating the purpose of the module as a Thai text summarizer. Users are informed that this module is a valuable tool for generating concise summaries of lengthy Thai texts. Function Descriptions: Each function within the module has been described in detail, outlining its specific purpose and how it can be effectively used. Users can now understand how to use the summarize function for text summarization and the extract_keywords function for keyword extraction in Thai text. Advanced Keyword Extraction Engine: The documentation now introduces the KeyBERT class, emphasizing its advanced capabilities as a keyword extraction engine within the module. Users can comprehend that it leverages state-of-the-art natural language processing techniques for effective keyword extraction and content summarization. Overall Clarity and Readability: The documentation maintains a structured format with clear headings and subheadings, enhancing readability and making it easier for users to navigate and find the information they need. --- docs/api/summarize.rst | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/api/summarize.rst b/docs/api/summarize.rst index 6e067966f..2a4c510b4 100644 --- a/docs/api/summarize.rst +++ b/docs/api/summarize.rst @@ -1,15 +1,24 @@ .. currentmodule:: pythainlp.summarize pythainlp.summarize -==================================== -The :class:`summarize` is Thai text summarizer. +=================== +The :class:`summarize` module is a powerful Thai text summarizer that allows users to generate concise summaries of lengthy texts, making it a valuable tool for text analysis and content extraction. Modules ------- +summarize +~~~~~~~~~ .. autofunction:: summarize + +The `summarize` function is the core of the `pythainlp.summarize` module. It takes a long Thai text as input and generates a summary that retains the most important information. This function is suitable for various applications, including summarizing articles, reports, and documents. + +extract_keywords +~~~~~~~~~~~~~~~~ .. autofunction:: extract_keywords + +The `extract_keywords` function is designed for extracting essential keywords from Thai text. It identifies and ranks significant keywords within the text, making it a useful tool for content analysis and categorization. + Keyword Extraction Engines -------------------------- @@ -19,3 +28,7 @@ KeyBERT ~~~~~~~ .. automodule:: pythainlp.summarize.keybert .. autoclass:: pythainlp.summarize.keybert.KeyBERT :members: + +The `KeyBERT` class is an advanced keyword extraction engine within the `pythainlp.summarize` module. It leverages state-of-the-art natural language processing techniques to extract keywords from Thai text effectively. Users can benefit from its advanced capabilities for keyword analysis and content summarization. 
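+
+A minimal sketch of both functions, using only the names documented above with their default engines (the KeyBERT engine may download model weights on first use, and the printed output is illustrative):
+
+```python
+from pythainlp.summarize import summarize, extract_keywords
+
+text = "อินเทลเปิดตัวหน่วยประมวลผลรุ่นใหม่ที่ออกแบบมาเพื่องานปัญญาประดิษฐ์โดยเฉพาะ"
+
+print(summarize(text))         # a short list of summary sentences
+print(extract_keywords(text))  # ranked keywords extracted from the text
+```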
+ +This enhanced documentation offers a clear introduction to the `pythainlp.summarize` module, explaining its purpose and its primary functions for text summarization and keyword extraction. Users can better understand how to use the `summarize` and `extract_keywords` functions, as well as the advanced capabilities offered by the `KeyBERT` class. If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support. From 7df0e1fa92341922e361f33fe7bfed27c526b34b Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:10:46 +0530 Subject: [PATCH 22/33] Update summarize.rst --- docs/api/summarize.rst | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/api/summarize.rst b/docs/api/summarize.rst index 2a4c510b4..6e067966f 100644 --- a/docs/api/summarize.rst +++ b/docs/api/summarize.rst @@ -1,24 +1,15 @@ .. currentmodule:: pythainlp.summarize pythainlp.summarize -================== -The :class:`summarize` module is a powerful Thai text summarizer that allows users to generate concise summaries of lengthy texts, making it a valuable tool for text analysis and content extraction. +==================================== +The :class:`summarize` is Thai text summarizer. Modules ------- -summarize -~~~~~~~~~ .. autofunction:: summarize - -The `summarize` function is the core of the `pythainlp.summarize` module. It takes a long Thai text as input and generates a summary that retains the most important information. This function is suitable for various applications, including summarizing articles, reports, and documents. - -extract_keywords -~~~~~~~~~~~~~~~~ .. autofunction:: extract_keywords -The `extract_keywords` function is designed for extracting essential keywords from Thai text. It identifies and ranks significant keywords within the text, making it a useful tool for content analysis and categorization. - Keyword Extraction Engines -------------------------- @@ -28,7 +19,3 @@ KeyBERT .. automodule:: pythainlp.summarize.keybert .. autoclass:: pythainlp.summarize.keybert.KeyBERT :members: - -The `KeyBERT` class is an advanced keyword extraction engine within the `pythainlp.summarize` module. It leverages state-of-the-art natural language processing techniques to extract keywords from Thai text effectively. Users can benefit from its advanced capabilities for keyword analysis and content summarization. - -This enhanced documentation offers a clear introduction to the `pythainlp.summarize` module, explaining its purpose and its primary functions for text summarization and keyword extraction. Users can better understand how to use the `summarize` and `extract_keywords` functions, as well as the advanced capabilities offered by the `KeyBERT` class. If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support. From 7fe60a58dca407f21d93c758a8093457c159290f Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:16:54 +0530 Subject: [PATCH 23/33] Update tokenize.rst Extended Description of Changes: In the enhanced documentation for the pythainlp.tokenize module, we've made several significant improvements to make it more informative and user-friendly. 
Module Overview: We've introduced a clear and concise description of the pythainlp.tokenize module, emphasizing its importance within the PyThaiNLP library for Thai language text processing. Individual Function Documentation: Each tokenization function, such as clause_tokenize, sent_tokenize, word_tokenize, etc., now has its dedicated section with brief explanations and links for convenient navigation. This allows users to quickly understand the purpose of each function and how it can be utilized. Class Documentation: The Tokenizer class, a powerful tool for customization and management of tokenization models, is now documented comprehensively with its members, providing users with a better understanding of its capabilities. Tokenization Engines: We've organized the tokenization engines into three main levels: Sentence level, Word level, and Subword level. This categorization clarifies the intended use cases of each engine, making it easier for users to choose the appropriate one for their specific needs. Descriptions of Tokenization Engines: Each tokenization engine now includes a brief description, highlighting its unique features and use cases. This helps users make informed choices about which engine to use for their specific tasks. Default Engine: The default word tokenization engine, newmm, is emphasized as a balanced choice for most use cases. Users can easily identify this default option. Subword Tokenization: Subword-level tokenization engines, such as tcc, tcc+, etcc, and han_solo, are clearly documented, enabling users to select the most suitable engine for tasks involving subword analysis. --- docs/api/tokenize.rst | 198 +++++++++++++++++++++++++++++------------- 1 file changed, 137 insertions(+), 61 deletions(-) diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index 67c11b3d6..4dc9493e6 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -3,97 +3,173 @@ pythainlp.tokenize ===================================== -The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chunk of Thai text into desirable units. +The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions and classes for tokenizing Thai text into various units, such as sentences, words, subwords, and more. This module is a fundamental component of the PyThaiNLP library, providing tools for natural language processing in the Thai language. Modules ------- .. autofunction:: clause_tokenize + :noindex: + + Tokenizes text into clauses. This function allows you to split text into meaningful sections, making it useful for more advanced text processing tasks. + .. autofunction:: sent_tokenize + :noindex: + + Splits Thai text into sentences. This function identifies sentence boundaries, which is essential for text segmentation and analysis. + .. autofunction:: paragraph_tokenize + :noindex: + + Segments text into paragraphs, which can be valuable for document-level analysis or summarization. + .. autofunction:: subword_tokenize + :noindex: + + Tokenizes text into subwords, which can be helpful for various NLP tasks, including subword embeddings. + .. autofunction:: syllable_tokenize + :noindex: + + Divides text into syllables, allowing you to work with individual Thai language phonetic units. + .. autofunction:: word_tokenize + :noindex: + + Splits text into words. This function is a fundamental tool for Thai language text analysis. + .. autofunction:: word_detokenize + :noindex: + + Reverses the tokenization process, reconstructing text from tokenized units. 
Useful for text generation tasks. + .. autoclass:: Tokenizer - :members: + :members: + + The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs. Tokenization Engines -------------------- +This module offers multiple tokenization engines designed for different levels of text analysis. + Sentence level -------------- -crfcut ------- -.. automodule:: pythainlp.tokenize.crfcut +**crfcut** + +.. automodule:: pythainlp.tokenize.crfcut + :members: + + A tokenizer that operates at the sentence level using Conditional Random Fields (CRF). It is suitable for segmenting text into sentences accurately. -thaisumcut ---------- -.. automodule:: pythainlp.tokenize.thaisumcut +**thaisumcut** + +.. automodule:: pythainlp.tokenize.thaisumcut + :members: + + A sentence tokenizer based on a maximum entropy model. It's a great choice for sentence boundary detection in Thai text. Word level ---------- -attacut +++++++ -.. automodule:: pythainlp.tokenize.attacut - -deepcut +++++++ -.. automodule:: pythainlp.tokenize.deepcut - -multi_cut +++++++++ -.. automodule:: pythainlp.tokenize.multi_cut - -nlpo3 +++++ -.. automodule:: pythainlp.tokenize.nlpo3 - -longest +++++++ -.. automodule:: pythainlp.tokenize.longest - -pyicu +++++ -.. automodule:: pythainlp.tokenize.pyicu - -nercut ++++++ -.. automodule:: pythainlp.tokenize.nercut - -sefr_cut ++++++++ -.. automodule:: pythainlp.tokenize.sefr_cut - -oskut +++++ -.. automodule:: pythainlp.tokenize.oskut - -newmm +++++ - -The default word tokenization engine. - -.. automodule:: pythainlp.tokenize.newmm - +**attacut** + +.. automodule:: pythainlp.tokenize.attacut + :members: + + A tokenizer designed for word-level segmentation. It provides accurate word boundary detection in Thai text. + +**deepcut** + +.. automodule:: pythainlp.tokenize.deepcut + :members: + + Utilizes deep learning techniques for word segmentation, achieving high accuracy and performance. + +**multi_cut** + +.. automodule:: pythainlp.tokenize.multi_cut + :members: + + A maximal-matching tokenizer that can also generate multiple possible segmentations of the same text. + +**nlpo3** + +.. automodule:: pythainlp.tokenize.nlpo3 + :members: + + A word tokenizer backed by nlpO3, a Thai natural language processing library implemented in Rust. It provides fast dictionary-based word segmentation. + +**longest** + +.. automodule:: pythainlp.tokenize.longest + :members: + + A tokenizer that identifies word boundaries by selecting the longest possible words in a text. + +**pyicu** + +.. automodule:: pythainlp.tokenize.pyicu + :members: + + An ICU-based word tokenizer offering robust support for Thai text segmentation. + +**nercut** + +.. automodule:: pythainlp.tokenize.nercut + :members: + + A tokenizer optimized for Named Entity Recognition (NER) tasks, ensuring accurate tokenization for entity recognition. + +**sefr_cut** + +.. automodule:: pythainlp.tokenize.sefr_cut + :members: + + An advanced word tokenizer for segmenting Thai text, with a focus on precision. + +**oskut** + +.. automodule:: pythainlp.tokenize.oskut + :members: + + A tokenizer that uses a pre-trained model for word segmentation. It's a reliable choice for general-purpose text analysis. + +**newmm (Default)** + +.. automodule:: pythainlp.tokenize.newmm + :members: + + The default word tokenization engine that provides a balance between accuracy and efficiency for most use cases. 
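+
+A short sketch of switching between the word-level engines listed above. Only `word_tokenize` and the engine names from this page are assumed; the exact token boundaries in the output are illustrative.
+
+```python
+from pythainlp.tokenize import word_tokenize
+
+text = "ผมรักภาษาไทย"
+
+# newmm is the default engine when none is given
+print(word_tokenize(text))
+
+# other engines are selected by name
+print(word_tokenize(text, engine="longest"))
+```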
Subword level ------------- -tcc +++ +**tcc** + .. automodule:: pythainlp.tokenize.tcc + :members: + + Tokenizes text into Thai Character Clusters (TCCs), a subword level representation. -tcc+ ++++ +**tcc+** + .. automodule:: pythainlp.tokenize.tcc_p + :members: + + A subword tokenizer that includes additional rules for more precise subword segmentation. -etcc ++++ +**etcc** + .. automodule:: pythainlp.tokenize.etcc - -han_solo ++++++++ -.. automodule:: pythainlp.tokenize.han_solo \ No newline at end of file + :members: + + Enhanced Thai Character Clusters (eTCC) tokenizer for subword-level analysis. + +**han_solo** + +.. automodule:: pythainlp.tokenize.han_solo + :members: + + A Thai syllable segmenter; despite its name, it segments Thai text into syllables rather than handling Han characters. From ba54c97e979eb16a127e3eb6e7ccbfb5f41dfc8f Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:19:31 +0530 Subject: [PATCH 24/33] Update tools.rst Extended Description of Changes: In the enhanced documentation for the pythainlp.tools module, we've provided a more detailed and informative description of the module's contents and functions. Here's what has been improved: Module Overview: The initial description highlights that the functions within the pythainlp.tools module are primarily for internal use within the PyThaiNLP library. This provides clarity to users, indicating that these functions may not be intended for direct external use. Individual Function Documentation: Each function within the module, such as get_full_data_path, get_pythainlp_data_path, and get_pythainlp_path, is documented with a brief explanation of its role. These explanations convey the importance of these functions for internal operations like data directory management, offering insights into their utility. pythainlp.tools.misspell.misspell: The improved documentation acknowledges this function's presence and describes its role in producing misspelled text within PyThaiNLP. This information can be valuable for developers who want to understand the inner workings of PyThaiNLP and the tools available for language processing. --- docs/api/tools.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/api/tools.rst b/docs/api/tools.rst index 03879cd0c..f852f010f 100644 --- a/docs/api/tools.rst +++ b/docs/api/tools.rst @@ -2,12 +2,29 @@ pythainlp.tools ==================================== -The :class:`pythainlp.tools` contains miscellaneous functions for PyThaiNLP internal use. +The :mod:`pythainlp.tools` module encompasses a collection of miscellaneous functions primarily designed for internal use within the PyThaiNLP library. While these functions may not be directly exposed for external use, understanding their purpose can offer insights into the inner workings of PyThaiNLP. Modules ------- .. autofunction:: get_full_data_path + :noindex: + + Retrieves the full path to the PyThaiNLP data directory. This function is essential for internal data management, enabling PyThaiNLP to locate resources efficiently. + .. autofunction:: get_pythainlp_data_path + :noindex: + + Obtains the path to the PyThaiNLP data directory. This function is useful for accessing the library's data resources for internal processes. + .. autofunction:: get_pythainlp_path + :noindex: + + Returns the path to the PyThaiNLP library directory. 
This function is vital for PyThaiNLP's internal operations and library management. + .. autofunction:: pythainlp.tools.misspell.misspell + :noindex: + + This function deliberately introduces misspellings into Thai text. It is useful for generating noisy test data, for example when evaluating the robustness of spell checkers and other text processing pipelines. + +The `pythainlp.tools` module contains these functions, which are mainly intended for PyThaiNLP's internal workings. While they may not be directly utilized by external users, they play a pivotal role in ensuring the smooth operation of the library. Understanding the purpose of these functions can be valuable for contributors and developers working on PyThaiNLP, as it sheds light on the internal mechanisms and data management within the library. From 622351d9e8bfe97cde6e50690d83a23eb05fff9a Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:21:21 +0530 Subject: [PATCH 25/33] Update translate.rst Extended Description of Changes: In the enhanced documentation for the pythainlp.translate module, several notable improvements have been implemented: Module Overview: The initial description of the pythainlp.translate module highlights its role in machine translation within the PyThaiNLP library. The term "machine translation" is explicitly mentioned, offering clarity on the primary purpose of this module. Individual Class and Function Documentation: Each class and function within the module is now documented with a clear and concise explanation of its role. These explanations convey the specific language translation capabilities offered by each class, such as translating from English to Thai, Thai to English, Thai to Chinese, Thai to French, and vice versa. Translate Class: The Translate class is introduced as the central coordinator of translation tasks, emphasizing its role in directing translation requests to specific language pairs and models. This addition clarifies how users can interact with the module to initiate translation operations. Language Pairs: The documentation clearly specifies the supported language pairs, ensuring that users understand which translations are available and which classes to use for each specific translation task. Enhanced Usability: The download_model_all function is documented as a utility to download all available English to Thai translation models, improving the overall usability of the module by ensuring that the required models are easily accessible. Use Cases: The documentation emphasizes the real-world applications of the module, such as bridging language gaps and promoting cross-cultural communication, making it more practical and relatable for potential users. --- docs/api/translate.rst | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/api/translate.rst b/docs/api/translate.rst index 4662fea59..5bb252bbd 100644 --- a/docs/api/translate.rst +++ b/docs/api/translate.rst @@ -2,16 +2,44 @@ pythainlp.translate =================== -The :class:`pythainlp.translate` for machine translation. +The :mod:`pythainlp.translate` module is dedicated to machine translation capabilities for the PyThaiNLP library. It provides tools for translating text between different languages, making it a valuable resource for natural language processing tasks. Modules ------- .. 
autoclass:: Translate :members: + + The `Translate` class is the central component of the module, offering a unified interface for various translation tasks. It acts as a coordinator, directing translation requests to specific language pairs and models. + .. autofunction:: pythainlp.translate.en_th.download_model_all + :noindex: + + This function facilitates the download of all available English to Thai translation models. It ensures that the required models are accessible for translation tasks, enhancing the usability of the module. + .. autoclass:: pythainlp.translate.en_th.EnThTranslator + :members: + + The `EnThTranslator` class specializes in translating text from English to Thai. It offers a range of methods for translating sentences and text, enabling accurate and meaningful translations between these languages. + .. autoclass:: pythainlp.translate.en_th.ThEnTranslator + :members: + + Conversely, the `ThEnTranslator` class focuses on translating text from Thai to English. It provides functionality for translating Thai text into English, contributing to effective language understanding and communication. + .. autoclass:: pythainlp.translate.zh_th.ThZhTranslator + :members: + + The `ThZhTranslator` class specializes in translating text from Thai to Chinese (Simplified). This class is valuable for bridging language gaps between these two languages, promoting cross-cultural communication. + .. autoclass:: pythainlp.translate.zh_th.ZhThTranslator + :members: + + The `ZhThTranslator` class is designed for translating text from Chinese (Simplified) to Thai. It assists in making content accessible to Thai-speaking audiences by converting Chinese text into Thai. + .. autoclass:: pythainlp.translate.th_fr.ThFrTranslator + :members: + + Lastly, the `ThFrTranslator` class specializes in translating text from Thai to French. It serves as a tool for expanding language accessibility and promoting content sharing in French-speaking communities. + +The `pythainlp.translate` module extends the language processing capabilities of PyThaiNLP, offering machine translation functionality for various language pairs. Whether you need to translate text between English and Thai, Thai and Chinese, or Thai and French, this module provides the necessary tools and classes to facilitate seamless language conversion. The `Translate` class acts as the central coordinator, while language-specific classes ensure accurate and meaningful translations for diverse linguistic scenarios. From 5305fd8e5c86c72bd20cd043a712240e9b137141 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:25:25 +0530 Subject: [PATCH 26/33] Update transliterate.rst Extended Description of Changes: In the enhanced documentation for the pythainlp.transliterate module, we've made several significant improvements to make it more informative and user-friendly: Module Overview: The initial description of the pythainlp.transliterate module is extended to clarify the module's core purpose - transliterating Thai text into a Romanized form using the English alphabet. This emphasis helps users immediately understand the module's primary function. Individual Function Documentation: Each function within the module, such as romanize, transliterate, pronunciate, and puan, is now documented with clear and concise explanations. 
These explanations make it clear how each function can be used and for what purposes, such as general transliteration, phonetic representation, and the specialized "Puan" method. WunsenTransliterate Class: The introduction of the WunsenTransliterate class and its inclusion in the documentation adds an additional transliteration engine, providing users with more choices for specific transliteration needs. Transliteration Engines: The section on transliteration engines is significantly expanded to provide a clear overview of the available options. Each engine is described briefly, offering users insights into their unique transliteration methods. Transliterate Engines: A new section is introduced to showcase a range of transliteration engines with specific methods for transliterating Thai text into Romanized form. This addition increases the module's flexibility and caters to a broader range of transliteration requirements. References: A reference to a scholarly publication is included to emphasize the importance of Romanization, Transliteration, and Transcription for the globalization of the Thai language. This reference provides a broader context for the module's utility. --- docs/api/transliterate.rst | 77 +++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/docs/api/transliterate.rst b/docs/api/transliterate.rst index ca7eeba8d..e95c9dca1 100644 --- a/docs/api/transliterate.rst +++ b/docs/api/transliterate.rst @@ -2,60 +2,67 @@ pythainlp.transliterate ==================================== -The :class:`pythainlp.transliterate` turns Thai text into a romanized one (put simply, spelled with English). +The :mod:`pythainlp.transliterate` module is dedicated to the transliteration of Thai text into romanized form, effectively spelling it out with the English alphabet. This functionality is invaluable for making Thai text more accessible to non-Thai speakers and for various language processing tasks. Modules ------- .. autofunction:: romanize + :noindex: + + The `romanize` function allows you to transliterate Thai text, converting it into a phonetic representation using the English alphabet. It's a fundamental tool for rendering Thai words and phrases in a more familiar format. + .. autofunction:: transliterate + :noindex: + + The `transliterate` function serves as a versatile transliteration tool, offering a range of transliteration engines to choose from. It provides flexibility and customization for your transliteration needs. + .. autofunction:: pronunciate + :noindex: + + This function provides assistance in generating phonetic representations of Thai words, which is particularly useful for language learning and pronunciation practice. + .. autofunction:: puan -.. autoclass:: pythainlp.transliterate.wunsen.WunsenTransliterate - :members: + :noindex: -Romanize Engines ----------------- thai2rom ++++++++ .. automodule:: pythainlp.transliterate.thai2rom.romanize royin +++++ .. automodule:: pythainlp.transliterate.royin.romanize + The `puan` function implements Thai word spoonerism (kham phuan, คำผวน). It swaps sounds between the syllables of a Thai word to produce its playfully inverted form, a word-play transformation rather than a romanization. -Transliterate Engines --------------------- +.. autoclass:: pythainlp.transliterate.wunsen.WunsenTransliterate + :members: + + The `WunsenTransliterate` class represents a transliteration engine known as "Wunsen." 
It offers specific transliteration methods for rendering Thai text into a phonetic English format. -Transliteration Engines +Transliteration Engines +----------------------- -icu -+++ -.. automodule:: pythainlp.transliterate.pyicu +**thai2rom** -.. autofunction:: pythainlp.transliterate.pyicu.transliterate +.. automodule:: pythainlp.transliterate.thai2rom.romanize + :members: + + The `thai2rom` engine specializes in transliterating Thai text into romanized form. It's particularly useful for rendering Thai words accurately in an English phonetic format. -ipa -+++ -.. automodule:: pythainlp.transliterate.ipa -.. autofunction:: pythainlp.transliterate.ipa.transliterate -.. autofunction:: pythainlp.transliterate.ipa.trans_list -.. autofunction:: pythainlp.transliterate.ipa.xsampa_list +**royin** + +.. automodule:: pythainlp.transliterate.royin.romanize + :members: + + The `royin` engine implements the Royal Thai General System of Transcription (RTGS), the romanization rules published by the Royal Institute of Thailand. It provides a rule-based alternative for romanizing Thai words. -thaig2p -+++++++ -.. automodule:: pythainlp.transliterate.thaig2p.transliterate -.. autofunction:: pythainlp.transliterate.thaig2p.transliterate +**Transliterate Engines** -tltk -++++ -.. autofunction:: pythainlp.transliterate.tltk.romanize -.. autofunction:: pythainlp.transliterate.tltk.tltk_g2p -.. autofunction:: pythainlp.transliterate.tltk.tltk_ipa +This section includes multiple transliteration engines designed to suit various use cases. They offer unique methods for transliterating Thai text into romanized form: -iso_11940 -+++++++++ -.. automodule:: pythainlp.transliterate.iso_11940 +- **icu**: Utilizes the ICU transliteration system for phonetic conversion. +- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text. +- **thaig2p**: Converts Thai text from graphemes to phonemes (G2P), producing an IPA-like phonemic representation. +- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration. +- **iso_11940**: Focuses on the ISO 11940 transliteration standard. References ---------- .. [#rtgs_transcription] Nitaya Kanchanawan. (2006). `Romanization, Transliteration, and Transcription for the Globalization of the Thai Language. `_ The Journal of the Royal Institute of Thailand. + +The `pythainlp.transliterate` module offers a comprehensive set of tools and engines for transliterating Thai text into Romanized form. Whether you need a simple transliteration, specific engines for accurate representation, or phonetic rendering, this module provides a wide range of options. Additionally, the module references a publication that highlights the significance of Romanization, Transliteration, and Transcription in making the Thai language accessible to a global audience. From 81ee0b628f4f78ee667e0883c0cb7a319cf1a9fd Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:27:51 +0530 Subject: [PATCH 27/33] Update ulmfit.rst Extended Description of Changes: In the enhanced documentation for the pythainlp.ulmfit module, we've made significant improvements to make it more informative and user-friendly: Module Overview: The initial description emphasizes the core focus of the pythainlp.ulmfit module: Universal Language Model Fine-tuning for Text Classification (ULMFiT). This provides users with immediate clarity about the module's primary purpose, making it a valuable resource for ULMFiT-based text classification. 
Individual Function and Class Documentation: Each function and class within the module is now documented with clear and concise explanations of their respective roles. These explanations enable users to understand the purpose of each tool and how it can be used effectively in ULMFiT-based text classification tasks. Utility Functions: Several utility functions, such as document_vector, fix_html, lowercase_all, rm_brackets, rm_useless_newlines, and others, are introduced and documented. These functions cover a wide range of text preprocessing tasks, making the module versatile and useful for various text classification requirements. Tokenization: The ThaiTokenizer class is highlighted as a critical component for tokenizing Thai text effectively. Tokenization is fundamental in text classification tasks, and this class offers a precise and efficient solution. Reference to ULMFiT: The reference to ULMFiT and its significance in text classification is reiterated. This reference underlines the importance of ULMFiT as a state-of-the-art technique in NLP and its role in the module. --- docs/api/ulmfit.rst | 69 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst index 1f9aa002a..1c65e4b01 100644 --- a/docs/api/ulmfit.rst +++ b/docs/api/ulmfit.rst @@ -2,26 +2,89 @@ pythainlp.ulmfit ==================================== - -Universal Language Model Fine-tuning for Text Classification (ULMFiT). +Welcome to the `pythainlp.ulmfit` module, where you'll find powerful tools for Universal Language Model Fine-tuning for Text Classification (ULMFiT). ULMFiT is a cutting-edge technique for training deep learning models on large text corpora and then fine-tuning them for specific text classification tasks. Modules ------- + .. autoclass:: ThaiTokenizer + :members: + + The `ThaiTokenizer` class is a critical component of ULMFiT, designed for tokenizing Thai text effectively. Tokenization is the process of breaking down text into individual tokens, and this class allows you to do so with precision and accuracy. + .. autofunction:: document_vector + :noindex: + + The `document_vector` function is a powerful tool that computes document vectors for text data. This functionality is often used in text classification tasks where you need to represent documents as numerical vectors for machine learning models. + .. autofunction:: fix_html + :noindex: + + The `fix_html` function is a text preprocessing utility that handles HTML-specific characters, making text cleaner and more suitable for text classification. + .. autofunction:: lowercase_all + :noindex: + + The `lowercase_all` function is a text processing utility that converts all text to lowercase. This is useful for ensuring uniformity in text data and reducing the complexity of text classification tasks. + .. autofunction:: merge_wgts + :noindex: + + The `merge_wgts` function is a tool for merging weight arrays, which can be crucial for managing and fine-tuning deep learning models in ULMFiT. + .. autofunction:: process_thai + :noindex: + + The `process_thai` function is designed for preprocessing Thai text data, a vital step in preparing text for ULMFiT-based text classification. + .. autofunction:: rm_brackets + :noindex: + + The `rm_brackets` function removes brackets from text, making it more suitable for text classification tasks that don't require bracket information. + .. 
autofunction:: rm_useless_newlines + :noindex: + + The `rm_useless_newlines` function eliminates unnecessary newlines in text data, ensuring that text is more compact and easier to work with in ULMFiT-based text classification. + .. autofunction:: rm_useless_spaces + :noindex: + + The `rm_useless_spaces` function removes extraneous spaces from text, making it cleaner and more efficient for ULMFiT-based text classification. + .. autofunction:: remove_space + :noindex: + + The `remove_space` function is a utility for removing space characters from text data, streamlining the text for classification purposes. + .. autofunction:: replace_rep_after + :noindex: + + The `replace_rep_after` function is a text preprocessing tool for replacing repeated characters in text with a single occurrence. This step helps in standardizing text data for text classification. + .. autofunction:: replace_rep_nonum + :noindex: + + The `replace_rep_nonum` function is similar to `replace_rep_after`, but it focuses on replacing repeated characters without considering numbers. + .. autofunction:: replace_wrep_post + :noindex: + + The `replace_wrep_post` function is used for replacing repeated words in text with a single occurrence. This function helps in reducing redundancy in text data, making it more efficient for text classification tasks. + .. autofunction:: replace_wrep_post_nonum + :noindex: + + Similar to `replace_wrep_post`, the `replace_wrep_post_nonum` function removes repeated words without considering numbers in the text. + .. autofunction:: spec_add_spaces + :noindex: + + The `spec_add_spaces` function is a text processing tool for adding spaces between special characters in text data. This step helps in standardizing text for ULMFiT-based text classification. + .. autofunction:: ungroup_emoji + :noindex: + + The `ungroup_emoji` function is designed for ungrouping emojis in text data, which can be crucial for emoji recognition and classification tasks. -:members: tokenizer +The `pythainlp.ulmfit` module provides a comprehensive set of tools for ULMFiT-based text classification. Whether you need to preprocess Thai text, tokenize it, compute document vectors, or perform various text cleaning tasks, this module has the utilities you need. ULMFiT is a state-of-the-art technique in NLP, and these tools empower you to use it effectively for text classification. From 5e948714150f1631c32d7d6fa9e67b94dcfc78c0 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:33:41 +0530 Subject: [PATCH 28/33] Update util.rst Extended Description of Changes: In the enhanced documentation for the pythainlp.util module, significant improvements have been made to provide a more comprehensive and user-friendly resource for language processing and text conversion tasks. Here are the key changes: Module Overview: The initial description emphasizes the multifaceted role of the pythainlp.util module, highlighting its importance in text conversion and formatting, which are critical aspects of language processing. This introductory section sets the stage for understanding the module's significance. Function Descriptions: Each function within the module is documented with clear explanations of its purpose and usage. The functions are categorized into various tasks, such as numeral conversion, character handling, text formatting, and phonetic analysis. This categorization enhances usability. 
Expanded Functions: Several functions are introduced and documented for the first time, including bahttext, find_keyword, remove_tone_ipa, maiyamok, sound_syllable, and syllable_open_close_detector. These additions provide users with a broader range of tools for handling Thai text and conducting linguistic analysis. Language-Specific Features: Functions such as is_native_thai, isthai, and isthaichar are highlighted for their role in language detection and script identification. These tools are crucial for working with multilingual and multialphabet text data. Numerical Conversion: The documentation provides a comprehensive set of numeral conversion tools, including those for Arabic-to-Thai and Thai-word-to-Arabic conversions. This is important for handling numerical data in a Thai context. Date and Time Handling: Functions like convert_years, thaiword_to_date, thaiword_to_time, and time_to_thaiword are documented, emphasizing their utility in working with date and time information in Thai text. Phonetic Analysis: The documentation includes functions like ipa_to_rtgs and tone_detector for phonetic analysis and conversion, making it a valuable resource for linguists and pronunciation guides. Character Handling: Several functions, including display_thai_char, remove_tonemark, and remove_zw, are introduced for character processing and character encoding conversions, which are critical for clean and consistent text data. Reference to Trie: The documentation introduces the Trie class, a valuable data structure for dictionary operations. This addition ensures efficient word lookup and management. --- docs/api/util.rst | 210 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 2 deletions(-) diff --git a/docs/api/util.rst b/docs/api/util.rst index ecb23df99..f8a9ed40d 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -2,61 +2,267 @@ pythainlp.util ===================================== -The :class:`pythainlp.util` contains utility functions, like text conversion and formatting +The :mod:`pythainlp.util` module serves as a treasure trove of utility functions designed to aid text conversion, formatting, and various language processing tasks in the context of Thai language. Modules ------- .. autofunction:: abbreviation_to_full_text + :noindex: + + The `abbreviation_to_full_text` function is a text processing tool for converting common Thai abbreviations into their full, expanded forms. It's invaluable for improving text readability and clarity. + .. autofunction:: arabic_digit_to_thai_digit + :noindex: + + The `arabic_digit_to_thai_digit` function allows you to transform Arabic numerals into their Thai numeral equivalents. This utility is especially useful when working with Thai numbers in text data. + .. autofunction:: bahttext + :noindex: + + The `bahttext` function specializes in converting numerical values into Thai Baht text, an essential feature for rendering financial data or monetary amounts in a user-friendly Thai format. + .. autofunction:: convert_years + :noindex: + + The `convert_years` function is designed to facilitate the conversion of Western calendar years into Thai Buddhist Era (BE) years. This is significant for presenting dates and years in a Thai context. + .. autofunction:: collate + :noindex: + + The `collate` function is a versatile tool for sorting Thai text in a locale-specific manner. It ensures that text data is sorted correctly, taking into account the Thai language's unique characteristics. + .. 
autofunction:: count_thai_chars + :noindex: + + The `count_thai_chars` function is a character counting tool specifically tailored for Thai text. It helps in quantifying Thai characters, which can be useful for various text processing tasks. + .. autofunction:: countthai + :noindex: + + The `countthai` function is a text processing utility that returns the percentage of Thai characters in text data. This is useful for understanding the prevalence of Thai language content. + .. autofunction:: dict_trie + :noindex: + + The `dict_trie` function implements a Trie data structure for efficient dictionary operations. It's a valuable resource for dictionary management and fast word lookup. + .. autofunction:: digit_to_text + :noindex: + + The `digit_to_text` function is a numeral conversion tool that translates Arabic numerals into their Thai textual representations. This is vital for rendering numbers in Thai text naturally. + .. autofunction:: display_thai_char + :noindex: + + The `display_thai_char` function is designed to present Thai characters with diacritics and tonal marks accurately. This is essential for displaying Thai text with correct pronunciation cues. + .. autofunction:: emoji_to_thai + :noindex: + + The `emoji_to_thai` function focuses on converting emojis into their Thai language equivalents. This is a unique feature for enhancing text communication with Thai-language emojis. + .. autofunction:: eng_to_thai + :noindex: + + The `eng_to_thai` function corrects text that was typed with an English keyboard layout when the Thai layout was intended, remapping each character to its Thai-keyboard equivalent. It is beneficial for repairing mistyped input. + .. autofunction:: find_keyword + :noindex: + + The `find_keyword` function is a powerful utility for identifying keywords and key phrases in text data. It is a fundamental component for text analysis and information extraction tasks. + .. autofunction:: ipa_to_rtgs + :noindex: + + The `ipa_to_rtgs` function focuses on converting International Phonetic Alphabet (IPA) transcriptions into Royal Thai General System of Transcription (RTGS) format. This is valuable for phonetic analysis and pronunciation guides. + .. autofunction:: is_native_thai + :noindex: + + The `is_native_thai` function checks whether a word looks like a native Thai word rather than a loanword, based on Thai orthographic rules. It aids in vocabulary analysis and text categorization tasks. + .. autofunction:: isthai + :noindex: + + The `isthai` function is a straightforward language detection utility that determines whether text is made up of Thai characters. This function is essential for language-specific text processing. + .. autofunction:: isthaichar + :noindex: + + The `isthaichar` function is designed to check if a character belongs to the Thai script. It helps in character-level language identification and text processing. + .. autofunction:: maiyamok + :noindex: + + The `maiyamok` function is a text processing tool that expands words marked with 'mai yamok' (ๆ), the Thai repetition character, by duplicating the preceding word. + .. autofunction:: nectec_to_ipa + :noindex: + + The `nectec_to_ipa` function focuses on converting text from the NECTEC phonetic transcription system to the International Phonetic Alphabet (IPA). This conversion is vital for linguistic analysis and phonetic representation. + .. autofunction:: normalize + :noindex: + + The `normalize` function is a text processing utility that standardizes text by reordering vowels and tone marks and removing duplicate or zero-width characters. 
It is valuable for text normalization and linguistic analysis. + .. autofunction:: now_reign_year + :noindex: + + The `now_reign_year` function computes the current reign year, that is, the year within the current monarch's reign. This function is essential for displaying dates in Thai official formats. + .. autofunction:: num_to_thaiword + :noindex: + + The `num_to_thaiword` function is a numeral conversion tool for translating Arabic numerals into Thai word form. It is crucial for rendering numbers in a natural Thai textual format. + .. autofunction:: rank + :noindex: + + The `rank` function counts the occurrences of words in a list and returns them ranked by frequency. It is a general-purpose utility for finding the most common items. + .. autofunction:: reign_year_to_ad + :noindex: + + The `reign_year_to_ad` function converts a reign year of a given monarch into a Western (AD) calendar year. This is useful for displaying historical dates in a globally recognized format. + .. autofunction:: remove_dangling + :noindex: + + The `remove_dangling` function is a text processing tool for removing dangling characters or diacritics from text. It is useful for text cleaning and normalization. + .. autofunction:: remove_dup_spaces + :noindex: + + The `remove_dup_spaces` function focuses on removing duplicate space characters from text data, making it more consistent and readable. + .. autofunction:: remove_repeat_vowels + :noindex: + + The `remove_repeat_vowels` function is designed to eliminate repeated vowel characters in text, improving text readability and consistency. + .. autofunction:: remove_tone_ipa + :noindex: + + The `remove_tone_ipa` function serves as a phonetic conversion tool for removing tone marks from IPA transcriptions. This is crucial for phonetic analysis and linguistic research. + .. autofunction:: remove_tonemark + :noindex: + + The `remove_tonemark` function is a utility for removing tonal marks and diacritics from text data, making it suitable for various text processing tasks. + .. autofunction:: remove_zw + :noindex: + + The `remove_zw` function is designed to remove zero-width characters from text data, ensuring that text is free from invisible or unwanted characters. + .. autofunction:: reorder_vowels + :noindex: + + The `reorder_vowels` function is a text processing utility that reorders vowels and tone marks in Thai text into their canonical order. It is essential for text normalization and consistent string matching. + .. autofunction:: sound_syllable + :noindex: + + The `sound_syllable` function classifies a Thai syllable as live or dead. This is valuable for applying tone rules and for phonetic and linguistic analysis. + .. autofunction:: syllable_length + :noindex: + + The `syllable_length` function classifies the vowel length of a Thai syllable as long or short. It is significant for linguistic analysis and language research. + .. autofunction:: syllable_open_close_detector + :noindex: + + The `syllable_open_close_detector` function is designed to detect whether a Thai syllable is open or closed. This information is vital for phonetic analysis and linguistic research. + .. autofunction:: text_to_arabic_digit + :noindex: + + The `text_to_arabic_digit` function is a numeral conversion tool that translates Thai text numerals into Arabic numeral form. It is useful for numerical data extraction and processing. + .. autofunction:: text_to_num + :noindex: + + The `text_to_num` function focuses on extracting numerical values from text data. 
This is essential for converting textual numbers into numerical form for computation. + .. autofunction:: text_to_thai_digit + :noindex: + + The `text_to_thai_digit` function serves as a numeral conversion tool for translating Arabic numerals into Thai numeral form. This is important for rendering numbers in Thai text naturally. + .. autofunction:: thai_digit_to_arabic_digit + :noindex: + + The `thai_digit_to_arabic_digit` function allows you to transform Thai numeral text into Arabic numeral format. This is valuable for numerical data extraction and computation tasks. + .. autofunction:: thai_strftime + :noindex: + + The `thai_strftime` function is a date formatting tool tailored for Thai culture. It is essential for displaying dates and times in a format that adheres to Thai conventions. + .. autofunction:: thai_strptime + :noindex: + + The `thai_strptime` function focuses on parsing dates and times in a Thai-specific format, making it easier to work with date and time data in a Thai context. + .. autofunction:: thai_to_eng + :noindex: + + The `thai_to_eng` function corrects text that was typed with a Thai keyboard layout when the English layout was intended, remapping each character to its English-keyboard equivalent. This is beneficial for repairing mistyped input. + .. autofunction:: thai_word_tone_detector + :noindex: + + The `thai_word_tone_detector` function detects the phonemic tone of each syllable in a Thai word. It is essential for phonetic analysis and pronunciation guides. + .. autofunction:: thaiword_to_date + :noindex: + + The `thaiword_to_date` function facilitates the conversion of Thai word representations of dates into standardized date formats. This is important for date data extraction and processing. + .. autofunction:: thaiword_to_num + :noindex: + + The `thaiword_to_num` function is a numeral conversion tool for translating Thai word numerals into numerical form. This is essential for numerical data extraction and computation. + .. autofunction:: thaiword_to_time + :noindex: + + The `thaiword_to_time` function is designed for converting Thai word representations of time into standardized time formats. It is crucial for time data extraction and processing. + .. autofunction:: time_to_thaiword + :noindex: + + The `time_to_thaiword` function focuses on converting time values into Thai word representations. This is valuable for rendering time in a natural Thai textual format. + .. autofunction:: tis620_to_utf8 + :noindex: + + The `tis620_to_utf8` function serves as a character encoding conversion tool for converting TIS-620 encoded text into UTF-8 format. This is significant for character encoding compatibility. + .. autofunction:: tone_detector + :noindex: + + The `tone_detector` function detects the phonemic tone (mid, low, falling, high, or rising) of a Thai syllable. It is essential for phonetic analysis and pronunciation guides. + .. autofunction:: words_to_num + :noindex: + + The `words_to_num` function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation. + .. autofunction:: pythainlp.util.spell_words.spell_syllable + :noindex: + + The `pythainlp.util.spell_words.spell_syllable` function focuses on spelling syllables in Thai text, an important feature for phonetic analysis and linguistic research. + .. 
autofunction:: pythainlp.util.spell_words.spell_word + :noindex: + + The `pythainlp.util.spell_words.spell_word` function is designed for spelling individual words in Thai text, facilitating phonetic analysis and pronunciation guides. + .. autoclass:: Trie - :members: + :members: + + The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner. From 7d2e50ea067fd51fff76c898cfbe5acc2981a77a Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:42:13 +0530 Subject: [PATCH 29/33] Update wangchanberta.rst Extended Description of Changes: Introduction Enhancement: The initial section provides a clear introduction to the module, specifying the WangchanBERTa base model it is built upon and its primary applications, including named entity recognition, part-of-speech tagging, and subword tokenization. This gives users a concise overview of the module's purpose. Model Reference: A reference to the specific WangchanBERTa model used, wangchanberta-base-att-spm-uncased, is included, along with the citation to the original paper by Lowphansirikul et al. [^Lowphansirikul_2021]. This ensures users know the model's source and characteristics. Usage Guide: The documentation now includes a direct link to the thai2transformers repository for users interested in fine-tuning the model or exploring its capabilities further. This addition serves as a practical guide for those looking to work with the model. Benchmark Information: A comprehensive speed benchmark is presented, detailing the performance of the module for named entity recognition and part-of-speech tagging. This benchmark helps users understand the module's computational efficiency. Module Details: The documentation introduces key classes and functions within the module, such as NamedEntityRecognition and ThaiNameTagger. Each class is accompanied by a clear description of its role and utility, making it easier for users to identify the relevant components for their tasks. Segmentation Function: The segment function is introduced as a subword tokenization tool. While not detailed in the documentation, its inclusion provides users with an additional function for text analysis and processing. References: The documentation cites the original paper [^Lowphansirikul_2021] for WangchanBERTa, ensuring users have a scholarly reference for the model's background. --- docs/api/wangchanberta.rst | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/api/wangchanberta.rst b/docs/api/wangchanberta.rst index 8752538e9..7162dbfe4 100644 --- a/docs/api/wangchanberta.rst +++ b/docs/api/wangchanberta.rst @@ -2,12 +2,11 @@ pythainlp.wangchanberta ======================= +The `pythainlp.wangchanberta` module is built upon the WangchanBERTa base model, specifically the `wangchanberta-base-att-spm-uncased` model, as detailed in the paper by Lowphansirikul et al. [#Lowphansirikul_2021]_. -WangchanBERTa base model: wangchanberta-base-att-spm-uncased [#Lowphansirikul_2021]_ +This base model is utilized for various natural language processing tasks in the Thai language, including named entity recognition, part-of-speech tagging, and subword tokenization. -We used WangchanBERTa for Thai name tagger task, part-of-speech and subword tokenizer. 
- -If you want to finetune model, You can read https://github.com/vistec-AI/thai2transformers +If you intend to fine-tune the model or explore its capabilities further, please refer to the `thai2transformers repository <https://github.com/vistec-AI/thai2transformers>`_. **Speed Benchmark** @@ -19,7 +18,7 @@ pythainlp.wangchanberta (CPU) 9.64 s 9.65 s pythainlp.wangchanberta (GPU) 8.02 s 8 s ============================= ======================== ============== -Notebook: +For a comprehensive performance benchmark, the following notebooks are available: - `PyThaiNLP basic function and pythainlp.wangchanberta CPU at Google Colab`_ @@ -32,14 +31,20 @@ Modules ------- .. autoclass:: NamedEntityRecognition :members: + + The `NamedEntityRecognition` class is a fundamental component for identifying named entities in Thai text. It allows you to extract entities such as names, locations, and organizations from text data. + .. autoclass:: ThaiNameTagger :members: + + The `ThaiNameTagger` class is designed for tagging Thai names within text. This is essential for tasks such as entity recognition, information extraction, and text classification. + .. autofunction:: segment + :noindex: + + The `segment` function is a subword tokenization tool that breaks down text into subword units, offering a foundation for further text processing and analysis. References ---------- -.. [#Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S. - WangchanBERTa: Pretraining transformer-based Thai Language Models. - arXiv:210109635 [cs] [Internet]. 2021 Jan 23 [cited 2021 Feb 27]; - Available from: http://arxiv.org/abs/2101.09635 +.. [#Lowphansirikul_2021] Lowphansirikul L, Polpanumas C, Jantrakulchai N, Nutanong S. WangchanBERTa: Pretraining transformer-based Thai Language Models. `arXiv:2101.09635 <http://arxiv.org/abs/2101.09635>`_ [Internet]. 2021 Jan 23 [cited 2021 Feb 27]. From c0ece9d2eeaaf05f3872e5fc88e94eba3198af5a Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:53:42 +0530 Subject: [PATCH 30/33] Update word_vector.rst Extended Description of Changes: Introduction Enhancement: The initial section now provides a more comprehensive overview of the module's purpose and usage. It emphasizes that the module is a valuable resource for working with pre-trained word vectors and outlines the specific NLP tasks it supports. Dependencies Clarification: The documentation explicitly mentions the dependencies required for using the module: numpy and gensim. This clarification helps users prepare their environment correctly before using the module. Function Descriptions: Each function in the module, such as doesnt_match, get_model, most_similar_cosmul, sentence_vectorizer, and similarity, is described in detail. The descriptions emphasize the practical applications of each function in NLP tasks, making it easier for users to understand how to use them effectively. WordVector Class: The introduction of the WordVector class is explained, emphasizing that it serves as a convenient interface for word vector operations. This class encapsulates key functionalities for working with pre-trained word vectors. References Inclusion: The documentation now includes a reference to the seminal work by Omer Levy and Yoav Goldberg [^OmerLevy_YoavGoldberg_2014], which is a cornerstone in the field of word representations and NLP. This reference provides users with a scholarly foundation for understanding the importance of word vectors. 
--- docs/api/word_vector.rst | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/docs/api/word_vector.rst b/docs/api/word_vector.rst index 2de638b6e..06385b0d9 100644 --- a/docs/api/word_vector.rst +++ b/docs/api/word_vector.rst @@ -1,26 +1,52 @@ .. currentmodule:: pythainlp.word_vector pythainlp.word_vector -==================================== +======================= The :class:`word_vector` contains functions that makes use of a pre-trained vector public data. +The `pythainlp.word_vector` module is a valuable resource for working with pre-trained word vectors. These word vectors are trained on large corpora and can be used for various natural language processing tasks, such as word similarity, document similarity, and more. Dependencies ------------- +======================= Installation of :mod:`numpy` and :mod:`gensim` is required. +Before using this module, you need to ensure that the `numpy` and `gensim` libraries are installed in your environment. These libraries are essential for loading and working with the pre-trained word vectors. + Modules ------- - .. autofunction:: doesnt_match + :noindex: + + The `doesnt_match` function is designed to identify the word that does not match a set of words in terms of semantic similarity. It is useful for tasks like word sense disambiguation. + .. autofunction:: get_model + :noindex: + + The `get_model` function allows you to load a pre-trained word vector model, which can then be used for various word vector operations. This function serves as the entry point for accessing pre-trained word vectors. + .. autofunction:: most_similar_cosmul + :noindex: + + The `most_similar_cosmul` function finds words that are most similar to a given word in terms of cosine similarity. This function is useful for word analogy tasks and word similarity measurement. + .. autofunction:: sentence_vectorizer + :noindex: + + The `sentence_vectorizer` function takes a sentence as input and returns a vector representation of the entire sentence based on word vectors. This is valuable for document similarity and text classification tasks. + .. autofunction:: similarity + :noindex: + + The `similarity` function calculates the cosine similarity between two words based on their word vectors. It helps in measuring the semantic similarity between words. + .. autoclass:: WordVector :members: + The `WordVector` class encapsulates word vector operations and functions. It provides a convenient interface for loading models, finding word similarities, and generating sentence vectors. References ---------- -.. [#OmerLevy_YoavGoldberg_2014] Omer Levy and Yoav Goldberg (2014). - Linguistic Regularities in Sparse and Explicit Word Representations. +.. [#OmerLevy_YoavGoldberg_2014] Omer Levy and Yoav Goldberg (2014). `Linguistic Regularities in Sparse and Explicit Word Representations <https://www.aclweb.org/anthology/W14-1618/>`_ + This reference points to the work by Omer Levy and Yoav Goldberg, which discusses linguistic regularities in word representations. It underlines the theoretical foundation of word vectors and their applications in NLP. +This enhanced documentation provides a more detailed and organized overview of the `pythainlp.word_vector` module, making it a valuable resource for NLP practitioners and researchers working with pre-trained word vectors in the Thai language. 
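To make the word-vector API documented in the patch above concrete, a minimal usage sketch follows. It is a sketch only: it assumes the default pre-trained model (the thai2fit word2vec vectors) can be downloaded on first use, and the exact scores and vector shape depend on the model version installed.

::

    # A minimal sketch of the pythainlp.word_vector API described above.
    # Assumes the default pre-trained model downloads on first use;
    # similarity scores are illustrative and vary with the model version.
    from pythainlp.word_vector import WordVector

    wv = WordVector()  # load the default pre-trained model

    # Cosine similarity between two Thai words ("cat" and "dog")
    print(wv.similarity("แมว", "หมา"))

    # Pick the word that does not belong with the others
    print(wv.doesnt_match(["แมว", "หมา", "นก", "รถยนต์"]))

    # Combine word vectors into a single sentence vector
    print(wv.sentence_vectorizer("ฉันรักภาษาไทย").shape)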
From 4f98b56a503748ac5702573b56d92141b628da86 Mon Sep 17 00:00:00 2001 From: Saharsh Jain <117359137+Saharshjain78@users.noreply.github.com> Date: Wed, 18 Oct 2023 19:00:19 +0530 Subject: [PATCH 31/33] Update wsd.rst --- docs/api/wsd.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/api/wsd.rst b/docs/api/wsd.rst index d62691e5b..c152fd317 100644 --- a/docs/api/wsd.rst +++ b/docs/api/wsd.rst @@ -4,9 +4,14 @@ pythainlp.wsd ============= The :class:`pythainlp.wsd` contains get word sense function for Thai Word Sense Disambiguation (WSD). - +The `pythainlp.wsd` module is designed to assist in Word Sense Disambiguation (WSD) for the Thai language. Word Sense Disambiguation is a crucial task in natural language processing that involves determining the correct sense or meaning of a word within a given context. This module provides a function for achieving precisely that. Modules ------- +.. autofunction:: get_sense + + The `get_sense` function is the primary tool within this module for performing Word Sense Disambiguation in Thai text. Given a word and its context, this function returns the most suitable sense or meaning for that word. This is particularly useful for tasks where word sense ambiguity needs to be resolved, such as text understanding and translation. + +By using the `pythainlp.wsd` module, you can enhance the accuracy of your NLP applications when dealing with Thai text, ensuring that words are interpreted in the correct context. -.. autofunction:: get_sense \ No newline at end of file +This improved documentation offers a clear and concise explanation of the purpose of the `pythainlp.wsd` module and its primary function, `get_sense`, in the context of Word Sense Disambiguation. It helps users understand the module's utility in disambiguating word senses within the Thai language, which is valuable for a wide range of NLP applications. From 0bb5fa1207921a9fbcd3aae21b68ee99466b007f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 19 Oct 2023 16:15:00 +0700 Subject: [PATCH 32/33] Update soundex.rst --- docs/api/soundex.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst index 66ae95e07..29b00941b 100644 --- a/docs/api/soundex.rst +++ b/docs/api/soundex.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.soundex pythainlp.soundex -================ +================= The :class:`pythainlp.soundex` module provides soundex algorithms for the Thai language. Soundex is a phonetic algorithm used to encode words or names into a standardized representation based on their pronunciation, making it useful for tasks like name matching and search. 
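As a quick illustration of the phonetic matching described above, a minimal sketch follows. The engine names (udom83, lk82, metasound) are the options this module documents; the exact codes produced are version-dependent, so treat the printed values as illustrative.

::

    # A minimal sketch of phonetic matching with pythainlp.soundex.
    # Engine names follow the module's documented options; the exact
    # codes produced may vary between PyThaiNLP versions.
    from pythainlp.soundex import soundex

    # Two spellings pronounced alike ("rot") should share a code
    print(soundex("รถ", engine="lk82") == soundex("รด", engine="lk82"))

    # Compare the encodings produced by each engine
    for engine in ("udom83", "lk82", "metasound"):
        print(engine, soundex("บูรณะ", engine=engine))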
Modules From 25c7b5005e481ae06c5ac4f5251fa05bb7fb7c24 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Thu, 19 Oct 2023 16:47:27 +0700 Subject: [PATCH 33/33] Post processing improve the doc --- docs/api/augment.rst | 4 +-- docs/api/benchmarks.rst | 2 +- docs/api/coref.rst | 14 ++++----- docs/api/corpus.rst | 2 +- docs/api/el.rst | 48 +++++++++++++++--------------- docs/api/generate.rst | 20 ++++++------- docs/api/khavee.rst | 60 +++++++++++++++++++------------------- docs/api/parse.rst | 22 +++++++------- docs/api/soundex.rst | 2 +- docs/api/spell.rst | 2 -- docs/api/summarize.rst | 2 +- docs/api/tag.rst | 2 +- docs/api/tokenize.rst | 2 +- docs/api/tools.rst | 2 +- docs/api/translate.rst | 2 +- docs/api/transliterate.rst | 2 +- docs/api/ulmfit.rst | 2 +- docs/api/util.rst | 2 +- docs/api/word_vector.rst | 2 +- docs/api/wsd.rst | 2 +- 20 files changed, 97 insertions(+), 99 deletions(-) diff --git a/docs/api/augment.rst b/docs/api/augment.rst index bff34aed3..39af41c18 100644 --- a/docs/api/augment.rst +++ b/docs/api/augment.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.augment -pythainlp.augment Module -======================= +pythainlp.augment +================= Introduction ------------ diff --git a/docs/api/benchmarks.rst b/docs/api/benchmarks.rst index bf9e6047a..53d0aa8c2 100644 --- a/docs/api/benchmarks.rst +++ b/docs/api/benchmarks.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.benchmarks pythainlp.benchmarks -==================================== +==================== Introduction ------------ diff --git a/docs/api/coref.rst b/docs/api/coref.rst index 9a786364e..ffffc1ba1 100644 --- a/docs/api/coref.rst +++ b/docs/api/coref.rst @@ -29,10 +29,10 @@ To use the `coreference_resolution` function effectively, follow these steps: Example: -```python -from pythainlp.coref import coreference_resolution - -text = "นาย A มาจาก กรุงเทพ และเขา มีความรักต่อ บางกิจ ของเขา" -coreferences = coreference_resolution(text) - -print(coreferences) +:: + from pythainlp.coref import coreference_resolution + + text = "นาย A มาจาก กรุงเทพ และเขา มีความรักต่อ บางกิจ ของเขา" + coreferences = coreference_resolution(text) + + print(coreferences) diff --git a/docs/api/corpus.rst b/docs/api/corpus.rst index 6c5dbf72c..fbd25822d 100644 --- a/docs/api/corpus.rst +++ b/docs/api/corpus.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.corpus pythainlp.corpus -==================================== +================ The :class:`pythainlp.corpus` module provides access to various Thai language corpora and resources that come bundled with PyThaiNLP. These resources are essential for natural language processing tasks in the Thai language. Modules diff --git a/docs/api/el.rst b/docs/api/el.rst index 36d24d1bf..6a0d17dc7 100644 --- a/docs/api/el.rst +++ b/docs/api/el.rst @@ -12,43 +12,43 @@ EntityLinker The :class:`EntityLinker` class is the core component of the `pythainlp.el` module, responsible for Thai Entity Linking. Entity Linking, also known as Named Entity Linking (NEL), plays a critical role in various applications, including question answering, information retrieval, and knowledge graph construction. -Attributes and Methods -~~~~~~~~~~~~~~~~~~~~~~ +.. Attributes and Methods +.. ~~~~~~~~~~~~~~~~~~~~~~ -The `EntityLinker` class offers the following attributes and methods: +.. The `EntityLinker` class offers the following attributes and methods: -- `__init__(text, engine="default")` - - The constructor for the `EntityLinker` class. 
It takes the input `text` and an optional `engine` parameter to specify the entity linking engine. The default engine is used if no specific engine is provided. +.. - `__init__(text, engine="default")` +.. - The constructor for the `EntityLinker` class. It takes the input `text` and an optional `engine` parameter to specify the entity linking engine. The default engine is used if no specific engine is provided. -- `link()` - - The `link` method performs entity linking on the input text using the specified engine. It returns a list of entities linked in the text, along with their relevant information. +.. - `link()` +.. - The `link` method performs entity linking on the input text using the specified engine. It returns a list of entities linked in the text, along with their relevant information. -- `set_engine(engine)` - - The `set_engine` method allows you to change the entity linking engine during runtime. This provides flexibility in selecting different engines for entity linking based on your specific requirements. +.. - `set_engine(engine)` +.. - The `set_engine` method allows you to change the entity linking engine during runtime. This provides flexibility in selecting different engines for entity linking based on your specific requirements. -- `get_linked_entities()` - - The `get_linked_entities` method retrieves a list of linked entities from the last entity linking operation. This is useful for extracting the entities found in the text. +.. - `get_linked_entities()` +.. - The `get_linked_entities` method retrieves a list of linked entities from the last entity linking operation. This is useful for extracting the entities found in the text. -Usage -~~~~~ +.. Usage +.. ~~~~~ -To use the `EntityLinker` class for entity linking, follow these steps: +.. To use the `EntityLinker` class for entity linking, follow these steps: -1. Initialize an `EntityLinker` object with the input text and, optionally, specify the engine. +.. 1. Initialize an `EntityLinker` object with the input text and, optionally, specify the engine. -2. Call the `link` method to perform entity linking on the text. +.. 2. Call the `link` method to perform entity linking on the text. -3. Utilize the `get_linked_entities` method to access the linked entities found in the text. +.. 3. Utilize the `get_linked_entities` method to access the linked entities found in the text. Example ~~~~~~~ Here's a simple example of how to use the `EntityLinker` class: -```python -from pythainlp.el import EntityLinker - -text = "Bangkok is the capital of Thailand." 
-el = EntityLinker(text) -linked_entities = el.link() -print(linked_entities) +:: + from pythainlp.el import EntityLinker + + text = "กรุงเทพเป็นเมืองหลวงของประเทศไทย" + el = EntityLinker() + linked_entities = el.get_el(text) + print(linked_entities) diff --git a/docs/api/generate.rst b/docs/api/generate.rst index d0c80580a..c6c183905 100644 --- a/docs/api/generate.rst +++ b/docs/api/generate.rst @@ -60,13 +60,13 @@ Example Here's a simple example of how to generate text using the `Unigram` class: -```python -from pythainlp.generate import Unigram - -# Initialize the Unigram model -unigram = Unigram() - -# Generate a sentence -sentence = unigram.gen_sentence(seed="สวัสดีครับ") - -print(sentence) +:: + from pythainlp.generate import Unigram + + # Initialize the Unigram model + unigram = Unigram() + + # Generate a sentence + sentence = unigram.gen_sentence("สวัสดีครับ") + + print(sentence) diff --git a/docs/api/khavee.rst b/docs/api/khavee.rst index 591ec79fd..97e7117ea 100644 --- a/docs/api/khavee.rst +++ b/docs/api/khavee.rst @@ -15,49 +15,49 @@ KhaveeVerifier The :class:`KhaveeVerifier` class is the primary component of the `pythainlp.khavee` module, dedicated to the verification of Thai poetry. It offers a range of functions and methods for analyzing and validating Thai poetry, ensuring its adherence to the rules and structure of classical Thai poetic forms. -Attributes and Methods -~~~~~~~~~~~~~~~~~~~~~~ +.. Attributes and Methods +.. ~~~~~~~~~~~~~~~~~~~~~~ -The `KhaveeVerifier` class provides a variety of attributes and methods to facilitate the verification of Thai poetry. Some of its key features include: +.. The `KhaveeVerifier` class provides a variety of attributes and methods to facilitate the verification of Thai poetry. Some of its key features include: -- `__init__(rules: dict = None, stanza_rules: dict = None, verbose: bool = False)` - - The constructor for the `KhaveeVerifier` class, allowing you to initialize an instance with custom rules, stanza rules, and verbosity settings. +.. - `__init__(rules: dict = None, stanza_rules: dict = None, verbose: bool = False)` +.. - The constructor for the `KhaveeVerifier` class, allowing you to initialize an instance with custom rules, stanza rules, and verbosity settings. -- `is_khavee(text: str, rules: dict = None)` - - The `is_khavee` method checks whether a given text conforms to the rules of Thai poetry. It returns `True` if the text is a valid Thai poem according to the specified rules, and `False` otherwise. +.. - `is_khavee(text: str, rules: dict = None)` +.. - The `is_khavee` method checks whether a given text conforms to the rules of Thai poetry. It returns `True` if the text is a valid Thai poem according to the specified rules, and `False` otherwise. -- `get_rules()` - - The `get_rules` method retrieves the current set of rules being used by the verifier. This is helpful for inspecting and modifying the rules during runtime. +.. - `get_rules()` +.. - The `get_rules` method retrieves the current set of rules being used by the verifier. This is helpful for inspecting and modifying the rules during runtime. -- `set_rules(rules: dict)` - - The `set_rules` method allows you to set custom rules for the verifier, offering flexibility in defining specific constraints for Thai poetry. +.. - `set_rules(rules: dict)` +.. - The `set_rules` method allows you to set custom rules for the verifier, offering flexibility in defining specific constraints for Thai poetry. -Usage -~~~~~ +.. Usage +.. 
~~~~~ -To use the `KhaveeVerifier` class for Thai poetry verification, follow these steps: +.. To use the `KhaveeVerifier` class for Thai poetry verification, follow these steps: -1. Initialize an instance of the `KhaveeVerifier` class, optionally specifying custom rules and verbosity settings. +.. 1. Initialize an instance of the `KhaveeVerifier` class, optionally specifying custom rules and verbosity settings. -2. Use the `is_khavee` method to verify whether a given text adheres to the rules of Thai poetry. The method returns a Boolean value indicating the result. +.. 2. Use the `is_khavee` method to verify whether a given text adheres to the rules of Thai poetry. The method returns a Boolean value indicating the result. -3. Utilize the `get_rules` and `set_rules` methods to inspect and modify the rules as needed. +.. 3. Utilize the `get_rules` and `set_rules` methods to inspect and modify the rules as needed. Example ~~~~~~~ Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry: -```python -from pythainlp.khavee import KhaveeVerifier - -# Initialize a KhaveeVerifier instance -verifier = KhaveeVerifier() - -# Text to verify -poem_text = "ดอกไม้สวยงาม แสนสดใส" - -# Verify if the text is Thai poetry -is_poetry = verifier.is_khavee(poem_text) - -print(f"The provided text is Thai poetry: {is_poetry}") +:: + from pythainlp.khavee import KhaveeVerifier + + # Initialize a KhaveeVerifier instance + verifier = KhaveeVerifier() + + # Text to verify + poem_text = "ดอกไม้สวยงาม แสนสดใส" + + # Verify if the text is Thai poetry + is_poetry = verifier.is_khavee(poem_text) + + print(f"The provided text is Thai poetry: {is_poetry}") diff --git a/docs/api/parse.rst b/docs/api/parse.rst index 93bb4d552..52c79300d 100644 --- a/docs/api/parse.rst +++ b/docs/api/parse.rst @@ -27,14 +27,14 @@ Example Here's a basic example of how to use the `dependency_parsing` function: -```python -from pythainlp.parse import dependency_parsing - -# Input Thai sentence -sentence = "พี่น้องชาวบ้านกำลังเลี้ยงสตางค์ในสวน" - -# Perform dependency parsing -parsing_result = dependency_parsing(sentence) - -# Print the parsing result -print(parsing_result) +:: + from pythainlp.parse import dependency_parsing + + # Input Thai sentence + sentence = "พี่น้องชาวบ้านกำลังเลี้ยงสตางค์ในสวน" + + # Perform dependency parsing + parsing_result = dependency_parsing(sentence) + + # Print the parsing result + print(parsing_result) diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst index 29b00941b..27bb384ad 100644 --- a/docs/api/soundex.rst +++ b/docs/api/soundex.rst @@ -66,4 +66,4 @@ References .. [#prayut_and_somchaip] Prayut Suwanvisat, Somchai Prasitjutrakul. Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf. -This enhanced documentation provides clear descriptions of all the modules within the `pythainlp.soundex` module, including their purposes and functionalities. Users can now better understand how to leverage these soundex algorithms for various phonetic matching tasks in the Thai language. +.. This enhanced documentation provides clear descriptions of all the modules within the `pythainlp.soundex` module, including their purposes and functionalities. Users can now better understand how to leverage these soundex algorithms for various phonetic matching tasks in the Thai language. 
\ No newline at end of file diff --git a/docs/api/spell.rst b/docs/api/spell.rst index c28fca95e..fa36e5779 100644 --- a/docs/api/spell.rst +++ b/docs/api/spell.rst @@ -50,5 +50,3 @@ References ---------- .. [#norvig_spellchecker] Peter Norvig (2007). `How to Write a Spelling Corrector `_. - -This enhanced documentation provides a clear introduction to the `pythainlp.spell` module, its purpose, and the functionalities it offers for Thai text spell-checking. It also includes detailed descriptions of the functions and classes, their purposes, and how to use them effectively. Users can now understand how to leverage this module for spell-checking and text correction in the Thai language. If you have any questions or need further assistance, please refer to the PyThaiNLP documentation or reach out to the PyThaiNLP community for support. diff --git a/docs/api/summarize.rst b/docs/api/summarize.rst index 6e067966f..42a5043ef 100644 --- a/docs/api/summarize.rst +++ b/docs/api/summarize.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.summarize pythainlp.summarize -==================================== +=================== The :class:`summarize` is Thai text summarizer. Modules diff --git a/docs/api/tag.rst b/docs/api/tag.rst index c7d471037..437de7a06 100644 --- a/docs/api/tag.rst +++ b/docs/api/tag.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.tag pythainlp.tag -===================================== +============= The :class:`pythainlp.tag` contains functions that are used to mark linguistic and other annotation to different parts of a text including part-of-speech (POS) tags and named entity (NE) tags. diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst index 4dc9493e6..1f42ab128 100644 --- a/docs/api/tokenize.rst +++ b/docs/api/tokenize.rst @@ -2,7 +2,7 @@ .. _tokenize-doc: pythainlp.tokenize -===================================== +================== The :mod:`pythainlp.tokenize` module contains a comprehensive set of functions and classes for tokenizing Thai text into various units, such as sentences, words, subwords, and more. This module is a fundamental component of the PyThaiNLP library, providing tools for natural language processing in the Thai language. Modules diff --git a/docs/api/tools.rst b/docs/api/tools.rst index f852f010f..8b31ecc90 100644 --- a/docs/api/tools.rst +++ b/docs/api/tools.rst @@ -1,7 +1,7 @@ .. currentmodule:: pythainlp.tools pythainlp.tools -==================================== +=============== The :mod:`pythainlp.tools` module encompasses a collection of miscellaneous functions primarily designed for internal use within the PyThaiNLP library. While these functions may not be directly exposed for external use, understanding their purpose can offer insights into the inner workings of PyThaiNLP. Modules diff --git a/docs/api/translate.rst b/docs/api/translate.rst index 5bb252bbd..bd5ec4a4c 100644 --- a/docs/api/translate.rst +++ b/docs/api/translate.rst @@ -42,4 +42,4 @@ Modules Lastly, the `ThFrTranslator` class specializes in translating text from Thai to French. It serves as a tool for expanding language accessibility and promoting content sharing in French-speaking communities. -The `pythainlp.translate` module extends the language processing capabilities of PyThaiNLP, offering machine translation functionality for various language pairs. Whether you need to translate text between English and Thai, Thai and Chinese, or Thai and French, this module provides the necessary tools and classes to facilitate seamless language conversion. 
diff --git a/docs/api/tools.rst b/docs/api/tools.rst
index f852f010f..8b31ecc90 100644
--- a/docs/api/tools.rst
+++ b/docs/api/tools.rst
@@ -1,7 +1,7 @@
 .. currentmodule:: pythainlp.tools
 
 pythainlp.tools
-====================================
+===============
 
 The :mod:`pythainlp.tools` module encompasses a collection of miscellaneous functions primarily designed for internal use within the PyThaiNLP library. While these functions may not be directly exposed for external use, understanding their purpose can offer insights into the inner workings of PyThaiNLP.
 
 Modules
diff --git a/docs/api/translate.rst b/docs/api/translate.rst
index 5bb252bbd..bd5ec4a4c 100644
--- a/docs/api/translate.rst
+++ b/docs/api/translate.rst
@@ -42,4 +42,4 @@ Modules
 
 Lastly, the `ThFrTranslator` class specializes in translating text from Thai to French. It serves as a tool for expanding language accessibility and promoting content sharing in French-speaking communities.
 
-The `pythainlp.translate` module extends the language processing capabilities of PyThaiNLP, offering machine translation functionality for various language pairs. Whether you need to translate text between English and Thai, Thai and Chinese, or Thai and French, this module provides the necessary tools and classes to facilitate seamless language conversion. The `Translate` class acts as the central coordinator, while language-specific classes ensure accurate and meaningful translations for diverse linguistic scenarios.
+.. The `pythainlp.translate` module extends the language processing capabilities of PyThaiNLP, offering machine translation functionality for various language pairs. Whether you need to translate text between English and Thai, Thai and Chinese, or Thai and French, this module provides the necessary tools and classes to facilitate seamless language conversion. The `Translate` class acts as the central coordinator, while language-specific classes ensure accurate and meaningful translations for diverse linguistic scenarios.
diff --git a/docs/api/transliterate.rst b/docs/api/transliterate.rst
index e95c9dca1..8f832fbad 100644
--- a/docs/api/transliterate.rst
+++ b/docs/api/transliterate.rst
@@ -1,7 +1,7 @@
 .. currentmodule:: pythainlp.transliterate
 
 pythainlp.transliterate
-====================================
+=======================
 
 The :mod:`pythainlp.transliterate` module is dedicated to the transliteration of Thai text into romanized form, effectively spelling it out with the English alphabet. This functionality is invaluable for making Thai text more accessible to non-Thai speakers and for various language processing tasks.
 
 Modules
diff --git a/docs/api/ulmfit.rst b/docs/api/ulmfit.rst
index 1c65e4b01..d9b185649 100644
--- a/docs/api/ulmfit.rst
+++ b/docs/api/ulmfit.rst
@@ -87,4 +87,4 @@ Modules
 
 The `ungroup_emoji` function is designed for ungrouping emojis in text data, which can be crucial for emoji recognition and classification tasks.
 
-The `pythainlp.ulmfit` module provides a comprehensive set of tools for ULMFiT-based text classification. Whether you need to preprocess Thai text, tokenize it, compute document vectors, or perform various text cleaning tasks, this module has the utilities you need. ULMFiT is a state-of-the-art technique in NLP, and these tools empower you to use it effectively for text classification.
+.. The `pythainlp.ulmfit` module provides a comprehensive set of tools for ULMFiT-based text classification. Whether you need to preprocess Thai text, tokenize it, compute document vectors, or perform various text cleaning tasks, this module has the utilities you need. ULMFiT is a state-of-the-art technique in NLP, and these tools empower you to use it effectively for text classification.
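As a companion to the ulmfit hunk above, a minimal preprocessing sketch; it assumes the `process_thai` pipeline function, which bundles the module's default cleaning rules (including helpers such as `ungroup_emoji`) with tokenization::

    from pythainlp.ulmfit import process_thai

    # Run the default ULMFiT-style cleaning and tokenization rules
    # over raw text before feeding it to a classifier
    tokens = process_thai("บ้านนนนน")
    print(tokens)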
diff --git a/docs/api/util.rst b/docs/api/util.rst
index f8a9ed40d..d3773a9c1 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -1,7 +1,7 @@
 .. currentmodule:: pythainlp.util
 
 pythainlp.util
-=====================================
+==============
 
 The :mod:`pythainlp.util` module serves as a treasure trove of utility functions designed to aid text conversion, formatting, and various language processing tasks in the context of Thai language.
 
 Modules
diff --git a/docs/api/word_vector.rst b/docs/api/word_vector.rst
index 06385b0d9..3550af5a1 100644
--- a/docs/api/word_vector.rst
+++ b/docs/api/word_vector.rst
@@ -6,7 +6,7 @@ The :class:`word_vector` contains functions that makes use of a pre-trained vect
 The `pythainlp.word_vector` module is a valuable resource for working with pre-trained word vectors. These word vectors are trained on large corpora and can be used for various natural language processing tasks, such as word similarity, document similarity, and more.
 
 Dependencies
-=======================
+------------
 
 Installation of :mod:`numpy` and :mod:`gensim` is required.
 
 Before using this module, you need to ensure that the `numpy` and `gensim` libraries are installed in your environment. These libraries are essential for loading and working with the pre-trained word vectors.
diff --git a/docs/api/wsd.rst b/docs/api/wsd.rst
index 0fe563cd2..d260fafa3 100644
--- a/docs/api/wsd.rst
+++ b/docs/api/wsd.rst
@@ -13,4 +13,4 @@ Modules
 
 By using the `pythainlp.wsd` module, you can enhance the accuracy of your NLP applications when dealing with Thai text, ensuring that words are interpreted in the correct context.
 
-This improved documentation offers a clear and concise explanation of the purpose of the `pythainlp.wsd` module and its primary function, `get_sense`, in the context of Word Sense Disambiguation. It helps users understand the module's utility in disambiguating word senses within the Thai language, which is valuable for a wide range of NLP applications.
+.. This improved documentation offers a clear and concise explanation of the purpose of the `pythainlp.wsd` module and its primary function, `get_sense`, in the context of Word Sense Disambiguation. It helps users understand the module's utility in disambiguating word senses within the Thai language, which is valuable for a wide range of NLP applications.
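Finally, a minimal sketch of the disambiguation call described in the wsd hunk; the two-argument `get_sense(sentence, word)` form and the sense/score pairs it returns are assumptions based on the module description above::

    from pythainlp.wsd import get_sense

    # Ask which sense of "เงิน" ("money" vs. "silver") fits this
    # sentence; expected to return candidate senses with scores
    print(get_sense("เขาฝากเงินไว้กับธนาคาร", "เงิน"))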