From a8dfcfc78e5e93fae0a413d46bb328e040aa70c2 Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 17:57:07 +0900 Subject: [PATCH 01/11] Improve: Change regular expression of "non-thai-characters" Before: directly describe non-thai-characters by rule-based patterns After: Just set as "anything except Thai-characters" --- pythainlp/tokenize/newmm.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 708c5efdd..1ca2bdc90 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -36,14 +36,7 @@ from pythainlp.tokenize.tcc_p import tcc_pos # match non-Thai tokens -_PAT_NONTHAI = re.compile( - r"""(?x) -[-a-zA-Z]+| # Latin characters -\d+([,\.]\d+)*| # numbers -[ \t]+| # spaces -\r?\n # newlines -""" -) +_PAT_NONTHAI = re.compile(r"[^\u0E00-\u0E7F]+") # match 2-consonant Thai tokens _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$") From 801518c466f753ad72363b10c85ea72be2cfa57b Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 1 Nov 2023 17:19:21 +0700 Subject: [PATCH 02/11] Update newmm.py --- pythainlp/tokenize/newmm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 1ca2bdc90..1d070bad7 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -36,7 +36,8 @@ from pythainlp.tokenize.tcc_p import tcc_pos # match non-Thai tokens -_PAT_NONTHAI = re.compile(r"[^\u0E00-\u0E7F]+") +# number, (space,whitespace) and non-Thai tokens +_PAT_NONTHAI = re.compile(r"\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+") # match 2-consonant Thai tokens _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$") From 24f8a4a361b2623f4be55508fcb552c17734c434 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 1 Nov 2023 17:26:13 +0700 Subject: [PATCH 03/11] Update newmm.py --- pythainlp/tokenize/newmm.py | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 1d070bad7..c8a701697 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -36,8 +36,8 @@ from pythainlp.tokenize.tcc_p import tcc_pos # match non-Thai tokens -# number, (space,whitespace) and non-Thai tokens -_PAT_NONTHAI = re.compile(r"\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+") +# English, number, (space,whitespace) and non-Thai tokens +_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+|") # match 2-consonant Thai tokens _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$") From ceac7635255b99b0eb9dff90705eb372b277dd6e Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 20:43:04 +0900 Subject: [PATCH 04/11] Fix: exclude Thai characters --- pythainlp/tokenize/newmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index c8a701697..48a43c90c 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -37,7 +37,7 @@ # match non-Thai tokens # English, number, (space,whitespace) and non-Thai tokens -_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+|") +_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+") # match 2-consonant Thai tokens _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$") From d94a225a86514043689e651f96b0267995597e0c Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 20:46:06 +0900 Subject: [PATCH 05/11] Improve: comment regex intention To make further maintenance easier --- pythainlp/tokenize/newmm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 48a43c90c..950f9d6a4 100644 --- a/pythainlp/tokenize/newmm.py +++ 
b/pythainlp/tokenize/newmm.py @@ -37,6 +37,8 @@ # match non-Thai tokens # English, number, (space,whitespace) and non-Thai tokens +# `|` is used as like "early return", +# which divide "abc123" to "abc", "123" for example. _PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+") # match 2-consonant Thai tokens From f5fa4976c3b7b8ca4ac270fd59e18c65adb08c0f Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 20:50:24 +0900 Subject: [PATCH 06/11] Refac: Make regex easier to read --- pythainlp/tokenize/newmm.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 950f9d6a4..6860af7d5 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -36,11 +36,17 @@ from pythainlp.tokenize.tcc_p import tcc_pos # match non-Thai tokens -# English, number, (space,whitespace) and non-Thai tokens # `|` is used as like "early return", # which divide "abc123" to "abc", "123" for example. 
-_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+") - +_PAT_NONTHAI = re.compile( + r"""(?x) +[-a-zA-Z]+| # Latin characters +\d+([,\.]\d+)*| # numbers +[ \t]+| # spaces +\r?\n # newlines +|[^\u0E00-\u0E7F]+ # other non-Thai characters +""") + # match 2-consonant Thai tokens _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$") From 30d60f6ec2c720d1291e03ba74630d5d1701d026 Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 21:02:11 +0900 Subject: [PATCH 07/11] Fix: fix to PEP 8 style --- pythainlp/tokenize/newmm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 6860af7d5..66c4583e4 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -37,16 +37,17 @@ # match non-Thai tokens # `|` is used as like "early return", -# which divide "abc123" to "abc", "123" for example. +# which divides "abc123" to "abc", "123" for example. 
_PAT_NONTHAI = re.compile( - r"""(?x) +r"""(?x) [-a-zA-Z]+| # Latin characters \d+([,\.]\d+)*| # numbers [ \t]+| # spaces \r?\n # newlines |[^\u0E00-\u0E7F]+ # other non-Thai characters -""") - +""" +) + # match 2-consonant Thai tokens _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$") From fb3e7bb8053184f4ff2772d10ca669b6ac84dec7 Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 21:06:03 +0900 Subject: [PATCH 08/11] Refac: unify position of | --- pythainlp/tokenize/newmm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 66c4583e4..3a7caac5f 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -43,8 +43,8 @@ [-a-zA-Z]+| # Latin characters \d+([,\.]\d+)*| # numbers [ \t]+| # spaces -\r?\n # newlines -|[^\u0E00-\u0E7F]+ # other non-Thai characters +\r?\n| # newlines +[^\u0E00-\u0E7F]+ # other non-Thai characters """ ) From 02e9cb5f040cdbb2df3b86cfba40f3467e063897 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 1 Nov 2023 19:52:12 +0700 Subject: [PATCH 09/11] Update other non-Thai characters in newmm --- pythainlp/tokenize/newmm.py | 2 +- tests/test_tokenize.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 3a7caac5f..673d7ce57 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -44,7 +44,7 @@ \d+([,\.]\d+)*| # numbers [ \t]+| # spaces \r?\n| # newlines -[^\u0E00-\u0E7F]+ # other non-Thai characters +[^\u0E00-\u0E7F \t]+ # other non-Thai characters """ ) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index c224a675c..1537b62c9 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -653,6 +653,15 @@ def test_newmm(self): keep_whitespace=False, ) ) + self.assertEqual( + word_tokenize("(คนไม่เอา)", engine="newmm"), ['(', 'คน', 'ไม่', 
'เอา', ')'] + ) + self.assertEqual( + word_tokenize("กม/ชม", engine="newmm"), ['กม', '/', 'ชม'] + ) + self.assertEqual( + word_tokenize("สีหน้า(รถ)", engine="newmm"), ['สีหน้า', '(', 'รถ', ')'] + ) def test_newmm_longtext(self): self.assertIsInstance( From 3d889f7be4cf90c30bc64285f7fdcdb6ba369656 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 1 Nov 2023 19:57:07 +0700 Subject: [PATCH 10/11] Update newmm.py --- pythainlp/tokenize/newmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 673d7ce57..42a2d898b 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -44,7 +44,7 @@ \d+([,\.]\d+)*| # numbers [ \t]+| # spaces \r?\n| # newlines -[^\u0E00-\u0E7F \t]+ # other non-Thai characters +[^\u0E00-\u0E7F \t\r\n]+ # other non-Thai characters """ ) From 2e2f0cf7e86b3bc39cf094c2a154a2732bc3675d Mon Sep 17 00:00:00 2001 From: Konbraphat <101827492+konbraphat51@users.noreply.github.com> Date: Wed, 1 Nov 2023 22:08:35 +0900 Subject: [PATCH 11/11] Refac: comment for addition Intention for ` \t\r\n` --- pythainlp/tokenize/newmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py index 42a2d898b..e19272989 100644 --- a/pythainlp/tokenize/newmm.py +++ b/pythainlp/tokenize/newmm.py @@ -44,7 +44,7 @@ \d+([,\.]\d+)*| # numbers [ \t]+| # spaces \r?\n| # newlines -[^\u0E00-\u0E7F \t\r\n]+ # other non-Thai characters +[^\u0E00-\u0E7F \t\r\n]+ # other non-Thai characters, and stops matching until space/newline """ )