From e6ea7e2b39fe06dc1e2c301fc233df27dee6e306 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 4 Nov 2024 07:16:06 +0000
Subject: [PATCH 1/2] Add nlpo3 to compact

- Add relevant nlpo3 test to testc
- Add notes on tests in main readme
- Bump nlpo3 version to 1.3.0 (fixes the karan bug)
---
 README.md                       |  17 ++++-
 requirements.txt                |   1 +
 setup.py                        |   5 +-
 tests/README.md                 |   8 +--
 tests/compact/testc_tokenize.py |  31 ++++++----
 tests/core/test_tokenize.py     | 106 ++++++++++++++++----------------
 tests/extra/testx_tokenize.py   |   4 +-
 7 files changed, 96 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index 1e81c9033..5480cb866 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,7 @@ pip install pythainlp[extra1,extra2,...]
 
 Possible `extras`:
 
 - `full` (install everything)
+- `compact` (install a stable and small subset of dependencies)
 - `attacut` (to support attacut, a fast and accurate tokenizer)
 - `benchmarks` (for [word tokenization benchmarking](tokenization-benchmark.md))
 - `icu` (for ICU, International Components for Unicode, support in transliteration and tokenization)
@@ -85,7 +86,8 @@ Possible `extras`:
 - `thai2rom` (for machine-learnt romanization)
 - `wordnet` (for Thai WordNet API)
 
-For dependency details, look at the `extras` variable in [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
+For dependency details, look at the `extras` variable in
+[`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
 
 ## Data Directory
 
@@ -110,6 +112,19 @@ To show how to use:
 thainlp help
 ```
 
+## Testing and Test Suites
+
+We test core functionalities on all officially supported Python versions.
+
+Some functionality requiring extra dependencies may be tested less frequently
+due to potential version conflicts or incompatibilities between packages.
+
+Test cases are categorized into three groups: core, compact, and extra.
+You can find these tests in the [tests/](/tests/) directory.
+
+For more detailed information on testing, please refer to the tests README:
+[tests/README.md](./tests/README.md)
+
 ## Licenses
 
 | | License |
diff --git a/requirements.txt b/requirements.txt
index 4000dc69e..4bdc940c1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 PyYAML>=5.4.1
+nlpo3>=1.3.0
 numpy>=1.22
 pyicu>=2.3
 python-crfsuite>=0.9.7
diff --git a/setup.py b/setup.py
index 65c2d2151..f03161e5f 100644
--- a/setup.py
+++ b/setup.py
@@ -67,7 +67,7 @@
     "ipa": ["epitran>=1.1"],
     "ml": ["numpy>=1.22", "torch>=1.0.0"],
     "mt5": ["sentencepiece>=0.1.91", "transformers>=4.6.0"],
-    "nlpo3": ["nlpo3>=1.2.2"],
+    "nlpo3": ["nlpo3>=1.3.0"],
     "onnx": ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"],
     "oskut": ["oskut>=1.3"],
     "sefr_cut": ["sefr_cut>=1.1"],
@@ -103,6 +103,7 @@
     # Compact dependencies, this one matches requirements.txt
     "compact": [
         "PyYAML>=5.4.1",
+        "nlpo3>=1.3.0",
         "numpy>=1.22",
         "pyicu>=2.3",
         "python-crfsuite>=0.9.7",
@@ -119,7 +120,7 @@
         "fastcoref>=2.1.5",
         "gensim>=4.0.0",
         "khamyo>=0.2.0",
-        "nlpo3>=1.2.2",
+        "nlpo3>=1.3.0",
         "nltk>=3.3",
         "numpy>=1.22",
         "onnxruntime>=1.10.0",
diff --git a/tests/README.md b/tests/README.md
index 31d8f5564..2a88e0a21 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -14,10 +14,10 @@ Tests are categorized into three groups: core, compact, and extra.
 ## Compact Tests (testc_*.py)
 
 - Run `unittest tests.compact`
-- Test a limited set of additional functionalities that rely on optional
-  dependencies specified in `requirements.txt`.
-- These dependencies are `PyYAML`, `numpy`, `pyicu`, `python-crfsuite`, and
-  `requests`.
+- Test a limited set of functionalities that rely on a stable and small subset
+  of optional dependencies specified in `requirements.txt`.
+- These dependencies are `PyYAML`, `nlpo3`, `numpy`, `pyicu`,
+  `python-crfsuite`, and `requests`.
 - Test with the latest two stable Python versions.
 
 ## Extra Tests (testx_*.py)
diff --git a/tests/compact/testc_tokenize.py b/tests/compact/testc_tokenize.py
index c04837550..0f3768773 100644
--- a/tests/compact/testc_tokenize.py
+++ b/tests/compact/testc_tokenize.py
@@ -25,19 +25,6 @@
 )
 
 
-class WordTokenizeICUTestCase(unittest.TestCase):
-    def test_icu(self):
-        self.assertEqual(pyicu.segment(None), [])
-        self.assertEqual(pyicu.segment(""), [])
-        self.assertEqual(
-            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
-            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
-        )
-
-    def test_word_tokenize_icu(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
-
-
 class SentTokenizeCRFCutTestCase(unittest.TestCase):
     def test_sent_tokenize(self):
         # Use default engine (crfcut)
@@ -88,3 +75,21 @@ def test_subword_tokenize(self):
         self.assertNotIn(
             "า", subword_tokenize("สวัสดีดาวอังคาร", engine="han_solo")
         )
+
+
+class WordTokenizeICUTestCase(unittest.TestCase):
+    def test_icu(self):
+        self.assertEqual(pyicu.segment(None), [])
+        self.assertEqual(pyicu.segment(""), [])
+        self.assertEqual(
+            word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
+            ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
+        )
+
+    def test_word_tokenize_icu(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
+
+
+class WordTokenizeNlpO3TestCase(unittest.TestCase):
+    def test_word_tokenize_nlpo3(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
index 393b02741..29463b10b 100644
--- a/tests/core/test_tokenize.py
+++ b/tests/core/test_tokenize.py
@@ -204,6 +204,59 @@
 SENT_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
 
 
+class DetokenizeTestCase(unittest.TestCase):
+    """Detokenize and regrouping test cases"""
+
+    def test_word_detokenize(self):
+        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
+            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
+            "ผมเลี้ยง 5 ตัว ๆ คนดี",
+        )
+        self.assertEqual(
+            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
+            "ม่ายย ผมเลี้ยง 5 ตัว",
+        )
+
+    def test_numeric_data_format(self):
+        engines = ["newmm"]
+
+        for engine in engines:
+            self.assertIn(
+                "127.0.0.1",
+                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
+            )
+
+            tokens = word_tokenize(
+                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
+            )
+            self.assertTrue(
+                any(value in tokens for value in ["12:12pm", "12:12"]),
+                msg=f"{engine}: {tokens}",
+            )
+            self.assertIn("11.11", tokens)
+
+            self.assertIn(
+                "1,234,567.89",
+                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
+            )
+
+            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
+            self.assertIn("2.5:1", tokens)
+            self.assertIn("5:2", tokens)
+
+
 class TokenizeTestCase(unittest.TestCase):
     def test_Tokenizer(self):
         _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE)
@@ -550,56 +603,3 @@ def test_tcc_p(self):
         #     )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
-
-
-class DetokenizeTestCase(unittest.TestCase):
-    """Detokenize and regrouping test cases"""
-
-    def test_word_detokenize(self):
-        self.assertIsInstance(word_detokenize(["ผม", "5"]), str)
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว"
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"),
-            [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]],
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 10 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"]),
-            "ผมเลี้ยง 5 ตัว ๆ คนดี",
-        )
-        self.assertEqual(
-            word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]),
-            "ม่ายย ผมเลี้ยง 5 ตัว",
-        )
-
-    def test_numeric_data_format(self):
-        engines = ["newmm"]
-
-        for engine in engines:
-            self.assertIn(
-                "127.0.0.1",
-                word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine),
-            )
-
-            tokens = word_tokenize(
-                "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine
-            )
-            self.assertTrue(
-                any(value in tokens for value in ["12:12pm", "12:12"]),
-                msg=f"{engine}: {tokens}",
-            )
-            self.assertIn("11.11", tokens)
-
-            self.assertIn(
-                "1,234,567.89",
-                word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine),
-            )
-
-            tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine)
-            self.assertIn("2.5:1", tokens)
-            self.assertIn("5:2", tokens)
diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py
index 0e9f05737..a86be6b87 100644
--- a/tests/extra/testx_tokenize.py
+++ b/tests/extra/testx_tokenize.py
@@ -306,9 +306,7 @@ def test_nercut(self):
         self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
 
 
-class WordTokenizeNlpO3TestCase(unittest.TestCase):
-    def test_word_tokenize_nlpo3(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
+
 
 
 class WordTokenizeOSKutTestCase(unittest.TestCase):

From 5c48bb0caff3747b7404a5c9f690a61c1c7b7ddd Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 4 Nov 2024 07:30:25 +0000
Subject: [PATCH 2/2] Remove nlpo3 from compact

nlpo3 cannot be built on Python 3.12 and 3.13.
---
 CONTRIBUTING.md                 | 11 +++++++----
 README_TH.md                    |  6 ++++--
 requirements.txt                |  1 -
 setup.py                        |  1 -
 tests/README.md                 |  2 +-
 tests/compact/testc_tokenize.py |  5 -----
 tests/extra/testx_tokenize.py   |  4 +++-
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c9fa48b5c..ecbcda896 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,7 +44,7 @@ so it may be a good idea to familiarize yourself with it.
 
 - We use the famous
   [gitflow](http://nvie.com/posts/a-successful-git-branching-model/)
   to manage our branches.
-- When you create pull requests on GitHub, Github Actions and AppVeyor will run tests
+- When you create pull requests on GitHub, GitHub Actions will run tests
   and several checks automatically. Click the "Details" link at the end of
   each check to see what needs to be fixed.
@@ -66,7 +66,7 @@ To run unit tests locally together with code coverage test: (from main `pythainlp/` directory) ```sh -coverage run -m unittest discover +coverage run -m unittest tests.core ``` See code coverage test: @@ -75,13 +75,16 @@ See code coverage test: coverage report ``` -Generate code coverage test in HTML (files will be available in `htmlcov/` directory): +Generate code coverage test in HTML +(files will be available in `htmlcov/` directory): ```sh coverage html ``` -Make sure the tests pass on both Github Actions and AppVeyor. +Make sure the tests pass on GitHub Actions. + +See more in [tests/README.md](./tests/README.md) ## Releasing diff --git a/README_TH.md b/README_TH.md index fcf14e0b6..07a1bca23 100644 --- a/README_TH.md +++ b/README_TH.md @@ -2,10 +2,9 @@
 [README_TH.md header: PyThaiNLP logo and the title "PyThaiNLP: Thai Natural Language Processing in Python"]
 [status badges: pypi, License, Download, Coverage Status, Codacy Badge, FOSSA Status]
-[Python 3.7 badge]
+[Python 3.9 badge]
-[build status badge]
@@ -51,6 +50,7 @@ PyThaiNLP มีความสามารถพื้นฐานสำหร
 - Thai datetime formatting (`thai_strftime`)
 - Thai-English keyboard misswitched fix (`eng_to_thai`, `thai_to_eng`)
 - Command-line interface for basic functions, like tokenization and pos tagging (run `thainlp` in your shell)
+
 อ่านรายละเอียดได้ที่ [tutorials](https://pythainlp.org/tutorials)
@@ -82,6 +82,7 @@ pip install pythainlp[extra1,extra2,...]
 
 รายการสำหรับติดตั้งผ่าน extras
 
 - `full` (ติดตั้งทุกอย่าง)
+- `compact` (ติดตั้งไลบรารีชุดเล็กที่ทดสอบแล้วว่าไม่ตีกันเองและติดตั้งได้ในทุกระบบปฏิบัติการ)
 - `attacut` (เพื่อสนับสนุน attacut ซึ่งเป็นตัวตัดคำที่ทำงานได้รวดเร็วและมีประสิทธิภาพ)
 - `benchmarks` (สำหรับ [word tokenization benchmarking](tokenization-benchmark.md))
 - `icu` (สำหรับการรองรับ ICU หรือ International Components for Unicode ในการถอดเสียงเป็นอักษรและการตัดแบ่งคำ)
@@ -90,6 +91,7 @@
 - `thai2fit` (สำหรับ Thai word vector)
 - `thai2rom` (สำหรับการถอดอักษรไทยเป็นอักษรโรมัน)
 - `wordnet` (สำหรับ Thai WordNet API)
+
 สำหรับโมดูลที่ต้องการ สามารถดูรายละเอียดได้ที่ตัวแปร `extras` ใน
 [`setup.py`](https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py).
diff --git a/requirements.txt b/requirements.txt
index 4bdc940c1..4000dc69e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 PyYAML>=5.4.1
-nlpo3>=1.3.0
 numpy>=1.22
 pyicu>=2.3
 python-crfsuite>=0.9.7
diff --git a/setup.py b/setup.py
index f03161e5f..806d8a6c8 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,6 @@
     # Compact dependencies, this one matches requirements.txt
     "compact": [
         "PyYAML>=5.4.1",
-        "nlpo3>=1.3.0",
         "numpy>=1.22",
         "pyicu>=2.3",
         "python-crfsuite>=0.9.7",
diff --git a/tests/README.md b/tests/README.md
index 2a88e0a21..1beb0e2ec 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -16,7 +16,7 @@ Tests are categorized into three groups: core, compact, and extra.
 - Run `unittest tests.compact`
 - Test a limited set of functionalities that rely on a stable and small subset
   of optional dependencies specified in `requirements.txt`.
-- These dependencies are `PyYAML`, `nlpo3`, `numpy`, `pyicu`,
+- These dependencies are `PyYAML`, `numpy`, `pyicu`,
   `python-crfsuite`, and `requests`.
 - Test with the latest two stable Python versions.
diff --git a/tests/compact/testc_tokenize.py b/tests/compact/testc_tokenize.py
index 0f3768773..44d02f3f1 100644
--- a/tests/compact/testc_tokenize.py
+++ b/tests/compact/testc_tokenize.py
@@ -88,8 +88,3 @@ def test_icu(self):
 
     def test_word_tokenize_icu(self):
         self.assertIsNotNone(word_tokenize(TEXT_1, engine="icu"))
-
-
-class WordTokenizeNlpO3TestCase(unittest.TestCase):
-    def test_word_tokenize_nlpo3(self):
-        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py
index a86be6b87..0e9f05737 100644
--- a/tests/extra/testx_tokenize.py
+++ b/tests/extra/testx_tokenize.py
@@ -306,7 +306,9 @@ def test_nercut(self):
         self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut"))
 
 
-
+class WordTokenizeNlpO3TestCase(unittest.TestCase):
+    def test_word_tokenize_nlpo3(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))
 
 
 class WordTokenizeOSKutTestCase(unittest.TestCase):
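
With these two patches, the nlpo3 test lives in the extra suite, which is not
run against every Python version. For comparison, a minimal sketch of an
alternative approach (not part of the patches above): guard the test case with
a skip so it can live in any suite and skip itself when the optional `nlpo3`
package is not installed, for example on Python 3.12 and 3.13, where nlpo3
currently cannot be built. `TEXT_1` below is a stand-in sample; the real test
modules define their own.

```python
# Sketch only: a self-skipping variant of WordTokenizeNlpO3TestCase.
# Assumes pythainlp is installed; nlpo3 itself is treated as optional.
import importlib.util
import unittest

from pythainlp.tokenize import word_tokenize

TEXT_1 = "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"  # stand-in for the suite's TEXT_1


@unittest.skipUnless(
    importlib.util.find_spec("nlpo3") is not None,
    "optional nlpo3 package is not installed",
)
class WordTokenizeNlpO3TestCase(unittest.TestCase):
    def test_word_tokenize_nlpo3(self):
        # Same assertion as the test moved to tests/extra/testx_tokenize.py
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="nlpo3"))


if __name__ == "__main__":
    unittest.main()
```

The trade-off: a skip guard keeps the case runnable everywhere but silently
reduces coverage on interpreters without an nlpo3 wheel, while the approach
taken in these patches keeps the compact dependency set strictly installable
on all supported Python versions.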