
Support Unicode Text Segmentation in Tokenizer, closes #507
davidmezzetti committed Jul 18, 2023
1 parent b21df33 commit dcd8067
Showing 2 changed files with 90 additions and 13 deletions.
76 changes: 65 additions & 11 deletions src/python/txtai/pipeline/data/tokenizer.py
@@ -5,12 +5,19 @@
import re
import string

import regex

from ..base import Pipeline


class Tokenizer(Pipeline):
"""
- Tokenizes text into a list of tokens. Primarily designed for English text.
Tokenizes text into tokens using one of the following methods.
1. Backwards compatible tokenization that only accepts alphanumeric tokens from the Latin alphabet.
2. Split using word boundary rules from the Unicode Text Segmentation algorithm (see Unicode Standard Annex #29).
This is similar to the standard tokenizer in Apache Lucene and works well for most languages.
"""

# fmt: off
@@ -21,22 +28,58 @@ class Tokenizer(Pipeline):
# fmt: on

@staticmethod
- def tokenize(text):
def tokenize(text, lowercase=True, emoji=True, alphanum=True, stopwords=True):
"""
- Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
Tokenizes text into a list of tokens. The default backwards compatible parameters filter out English stop words and only
accept alphanumeric tokens.
Args:
text: input text
lowercase: lower cases all tokens if True, defaults to True
emoji: tokenize emoji in text if True, defaults to True
alphanum: requires 2+ character alphanumeric tokens if True, defaults to True
stopwords: removes provided stop words if a list, removes default English stop words if True, defaults to True
Returns:
list of tokens
"""

- return Tokenizer()(text)
# Create a tokenizer with backwards compatible settings
return Tokenizer(lowercase, emoji, alphanum, stopwords)(text)

def __init__(self, lowercase=True, emoji=True, alphanum=False, stopwords=False):
"""
Creates a new tokenizer. The default parameters segment text per Unicode Standard Annex #29.
Args:
lowercase: lower cases all tokens if True, defaults to True
emoji: tokenize emoji in text if True, defaults to True
alphanum: requires 2+ character alphanumeric tokens if True, defaults to False
stopwords: removes provided stop words if a list, removes default English stop words if True, defaults to False
"""

# Lowercase
self.lowercase = lowercase

# Text segmentation
self.alphanum, self.segment = None, None
if alphanum:
# Alphanumeric regex that accepts tokens that meet following rules:
# - Strings to be at least 2 characters long AND
# - At least 1 non-trailing alpha character in string
# Note: The standard Python re module is much faster than regex for this expression
self.alphanum = re.compile(r"^\d*[a-z][\-.0-9:_a-z]{1,}$")
else:
# Text segmentation per Unicode Standard Annex #29
pattern = r"\w\p{Extended_Pictographic}\p{WB:RegionalIndicator}" if emoji else r"\w"
self.segment = regex.compile(rf"[{pattern}](?:\B\S)*", flags=regex.WORD)

# Stop words
self.stopwords = stopwords if isinstance(stopwords, list) else Tokenizer.STOP_WORDS if stopwords else False

def __call__(self, text):
"""
- Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
Tokenizes text into a list of tokens.
Args:
text: input text
@@ -45,10 +88,21 @@ def __call__(self, text):
list of tokens
"""

- # Convert to all lowercase, split on whitespace, strip punctuation
- tokens = [token.strip(string.punctuation) for token in text.lower().split()]
# Lowercase
text = text.lower() if self.lowercase else text

if self.alphanum:
# Text segmentation using standard split
tokens = [token.strip(string.punctuation) for token in text.split()]

# Filter on alphanumeric strings.
tokens = [token for token in tokens if re.match(self.alphanum, token)]
else:
# Text segmentation per Unicode Standard Annex #29
tokens = regex.findall(self.segment, text)

# Stop words
if self.stopwords:
tokens = [token for token in tokens if token not in self.stopwords]

- # Tokenize on alphanumeric strings.
- # Require strings to be at least 2 characters long.
- # Require at least 1 non-trailing alpha character in string.
- return [token for token in tokens if re.match(r"^\d*[a-z][\-.0-9:_a-z]{1,}$", token) and token not in Tokenizer.STOP_WORDS]
return tokens
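
Note (not part of the commit): a minimal usage sketch of the two tokenization modes above, assuming the Tokenizer class is exposed as txtai.pipeline.Tokenizer. The expected outputs mirror the test cases in the next file.

from txtai.pipeline import Tokenizer

# Backwards compatible static method: lowercases text, keeps only 2+ character
# alphanumeric tokens and removes the default English stop words
print(Tokenizer.tokenize("Y this is a test!"))  # ["test"]
print(Tokenizer.tokenize("abc123 ABC 123"))     # ["abc123", "abc"]

# Default instance settings: Unicode Text Segmentation (UAX #29) word rules,
# no alphanumeric filter and no stop word removal
tokenizer = Tokenizer()
print(tokenizer("abc123 ABC 123"))              # ["abc123", "abc", "123"]
print(tokenizer("111-111-1111"))                # ["111", "111", "1111"]
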
27 changes: 25 additions & 2 deletions test/python/testpipeline/testtokenizer.py
@@ -12,10 +12,33 @@ class TestTokenizer(unittest.TestCase):
Tokenizer tests.
"""

- def testTokenize(self):
def testAlphanumTokenize(self):
"""
- Test tokenize
Test alphanumeric tokenization
"""

# Alphanumeric tokenization through backwards compatible static method
self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])

def testStandardTokenize(self):
"""
Test standard tokenization
"""

# Default standard tokenizer parameters
tokenizer = Tokenizer()

# Define token tests
tests = [
("Y this is a test!", ["y", "this", "is", "a", "test"]),
("abc123 ABC 123", ["abc123", "abc", "123"]),
("Testing hy-phenated words", ["testing", "hy", "phenated", "words"]),
("111-111-1111", ["111", "111", "1111"]),
("Test.1234", ["test", "1234"]),
]

# Run through tests
for test, result in tests:
# Unicode Text Segmentation per Unicode Annex #29
self.assertEqual(tokenizer(test), result)
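
Note (not part of the commit): the new constructor parameters can also be combined, for example keeping original case while stripping a custom stop word list. A short sketch, again assuming the txtai.pipeline.Tokenizer import path; the expected output is inferred from the __call__ logic above rather than taken from a test in this commit.

from txtai.pipeline import Tokenizer

# UAX #29 segmentation with original case preserved and a custom stop word list;
# stop words are compared against tokens as-is, so matching case matters
tokenizer = Tokenizer(lowercase=False, stopwords=["is", "a"])
print(tokenizer("This is a Test"))  # expected: ["This", "Test"]
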
