From a8dfcfc78e5e93fae0a413d46bb328e040aa70c2 Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 17:57:07 +0900
Subject: [PATCH 01/11] Improve: Change regular expression of
 "non-thai-characters"

Before: non-Thai characters were described explicitly by rule-based patterns
After: simply match "anything except Thai characters"
---
 pythainlp/tokenize/newmm.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 708c5efdd..1ca2bdc90 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -36,14 +36,7 @@
 from pythainlp.tokenize.tcc_p import tcc_pos
 
 # match non-Thai tokens
-_PAT_NONTHAI = re.compile(
-    r"""(?x)
-[-a-zA-Z]+|        # Latin characters
-\d+([,\.]\d+)*|    # numbers
-[ \t]+|            # spaces
-\r?\n              # newlines
-"""
-)
+_PAT_NONTHAI = re.compile(r"[^\u0E00-\u0E7F]+")
 
 # match 2-consonant Thai tokens
 _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")

From 801518c466f753ad72363b10c85ea72be2cfa57b Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 1 Nov 2023 17:19:21 +0700
Subject: [PATCH 02/11] Update newmm.py

---
 pythainlp/tokenize/newmm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 1ca2bdc90..1d070bad7 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -36,7 +36,8 @@
 from pythainlp.tokenize.tcc_p import tcc_pos
 
 # match non-Thai tokens
-_PAT_NONTHAI = re.compile(r"[^\u0E00-\u0E7F]+")
+# number, (space,whitespace) and non-Thai tokens
+_PAT_NONTHAI = re.compile(r"\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+")
 
 # match 2-consonant Thai tokens
 _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")

From 24f8a4a361b2623f4be55508fcb552c17734c434 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 1 Nov 2023 17:26:13 +0700
Subject: [PATCH 03/11] Update newmm.py

---
 pythainlp/tokenize/newmm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 1d070bad7..c8a701697 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -36,8 +36,8 @@
 from pythainlp.tokenize.tcc_p import tcc_pos
 
 # match non-Thai tokens
-# number, (space,whitespace) and non-Thai tokens
-_PAT_NONTHAI = re.compile(r"\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+")
+# English, number, (space,whitespace) and non-Thai tokens
+_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+|")
 
 # match 2-consonant Thai tokens
 _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")

From ceac7635255b99b0eb9dff90705eb372b277dd6e Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 20:43:04 +0900
Subject: [PATCH 04/11] Fix: exclude Thai characters

---
 pythainlp/tokenize/newmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index c8a701697..48a43c90c 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -37,7 +37,7 @@
 
 # match non-Thai tokens
 # English, number, (space,whitespace) and non-Thai tokens
-_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+|")
+_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+")
 
 # match 2-consonant Thai tokens
 _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")

From d94a225a86514043689e651f96b0267995597e0c Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 20:46:06 +0900
Subject: [PATCH 05/11] Improve: comment regex intention

To make future maintenance easier
---
 pythainlp/tokenize/newmm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 48a43c90c..950f9d6a4 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -37,6 +37,8 @@
 
 # match non-Thai tokens
 # English, number, (space,whitespace) and non-Thai tokens
+# `|` is used as like "early return",
+# which divide "abc123" to "abc", "123" for example.
 _PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+")
 
 # match 2-consonant Thai tokens

From f5fa4976c3b7b8ca4ac270fd59e18c65adb08c0f Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 20:50:24 +0900
Subject: [PATCH 06/11] Refac: Make regex easier to read

---
 pythainlp/tokenize/newmm.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 950f9d6a4..6860af7d5 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -36,11 +36,17 @@
 from pythainlp.tokenize.tcc_p import tcc_pos
 
 # match non-Thai tokens
-# English, number, (space,whitespace) and non-Thai tokens
 # `|` is used as like "early return",
 # which divide "abc123" to "abc", "123" for example.
-_PAT_NONTHAI = re.compile(r"[-a-zA-Z]+|\d+([,\.]\d+)*|[ \t]+|\r?\n|[^\u0E00-\u0E7F]+")
-
+_PAT_NONTHAI = re.compile(
+   r"""(?x)
+[-a-zA-Z]+|        # Latin characters
+\d+([,\.]\d+)*|    # numbers
+[ \t]+|            # spaces
+\r?\n              # newlines
+|[^\u0E00-\u0E7F]+ # other non-Thai characters
+""")
+    
 # match 2-consonant Thai tokens
 _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")
 

From 30d60f6ec2c720d1291e03ba74630d5d1701d026 Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 21:02:11 +0900
Subject: [PATCH 07/11] Fix: fix to PEP 8 style

---
 pythainlp/tokenize/newmm.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 6860af7d5..66c4583e4 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -37,16 +37,17 @@
 
 # match non-Thai tokens
 # `|` is used as like "early return",
-# which divide "abc123" to "abc", "123" for example.
+# which divides "abc123" to "abc", "123" for example.
 _PAT_NONTHAI = re.compile(
-   r"""(?x)
+r"""(?x)
 [-a-zA-Z]+|        # Latin characters
 \d+([,\.]\d+)*|    # numbers
 [ \t]+|            # spaces
 \r?\n              # newlines
 |[^\u0E00-\u0E7F]+ # other non-Thai characters
-""")
-    
+"""
+)
+
 # match 2-consonant Thai tokens
 _PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")
 

From fb3e7bb8053184f4ff2772d10ca669b6ac84dec7 Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 21:06:03 +0900
Subject: [PATCH 08/11] Refac: unify position of |

---
 pythainlp/tokenize/newmm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 66c4583e4..3a7caac5f 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -43,8 +43,8 @@
 [-a-zA-Z]+|        # Latin characters
 \d+([,\.]\d+)*|    # numbers
 [ \t]+|            # spaces
-\r?\n              # newlines
-|[^\u0E00-\u0E7F]+ # other non-Thai characters
+\r?\n|             # newlines
+[^\u0E00-\u0E7F]+  # other non-Thai characters
 """
 )
 

From 02e9cb5f040cdbb2df3b86cfba40f3467e063897 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 1 Nov 2023 19:52:12 +0700
Subject: [PATCH 09/11] Update other non-Thai characters in newmm

---
 pythainlp/tokenize/newmm.py | 2 +-
 tests/test_tokenize.py      | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 3a7caac5f..673d7ce57 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -44,7 +44,7 @@
 \d+([,\.]\d+)*|    # numbers
 [ \t]+|            # spaces
 \r?\n|             # newlines
-[^\u0E00-\u0E7F]+  # other non-Thai characters
+[^\u0E00-\u0E7F \t]+  # other non-Thai characters
 """
 )
 
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index c224a675c..1537b62c9 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -653,6 +653,15 @@ def test_newmm(self):
                 keep_whitespace=False,
             )
         )
+        self.assertEqual(
+            word_tokenize("(คนไม่เอา)", engine="newmm"), ['(', 'คน', 'ไม่', 'เอา', ')']
+        )
+        self.assertEqual(
+            word_tokenize("กม/ชม", engine="newmm"), ['กม', '/', 'ชม']
+        )
+        self.assertEqual(
+            word_tokenize("สีหน้า(รถ)", engine="newmm"), ['สีหน้า', '(', 'รถ', ')']
+        )
 
     def test_newmm_longtext(self):
         self.assertIsInstance(

From 3d889f7be4cf90c30bc64285f7fdcdb6ba369656 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 1 Nov 2023 19:57:07 +0700
Subject: [PATCH 10/11] Update newmm.py

---
 pythainlp/tokenize/newmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 673d7ce57..42a2d898b 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -44,7 +44,7 @@
 \d+([,\.]\d+)*|    # numbers
 [ \t]+|            # spaces
 \r?\n|             # newlines
-[^\u0E00-\u0E7F \t]+  # other non-Thai characters
+[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters
 """
 )
 

From 2e2f0cf7e86b3bc39cf094c2a154a2732bc3675d Mon Sep 17 00:00:00 2001
From: Konbraphat <101827492+konbraphat51@users.noreply.github.com>
Date: Wed, 1 Nov 2023 22:08:35 +0900
Subject: [PATCH 11/11] Refac: comment for addition

Explain the intention of ` \t\r\n`
---
 pythainlp/tokenize/newmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index 42a2d898b..e19272989 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -44,7 +44,7 @@
 \d+([,\.]\d+)*|    # numbers
 [ \t]+|            # spaces
 \r?\n|             # newlines
-[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters
+[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters, and stops matching until space/newline
 """
 )