diff --git a/src/python/txtai/pipeline/data/tokenizer.py b/src/python/txtai/pipeline/data/tokenizer.py
index b14106030..4c5b35acf 100644
--- a/src/python/txtai/pipeline/data/tokenizer.py
+++ b/src/python/txtai/pipeline/data/tokenizer.py
@@ -5,12 +5,19 @@
 import re
 import string
 
+import regex
+
 from ..base import Pipeline
 
 
 class Tokenizer(Pipeline):
     """
-    Tokenizes text into a list of tokens. Primarily designed for English text.
+    Tokenizes text into tokens using one of the following methods.
+
+    1. Backwards compatible tokenization that only accepts alphanumeric tokens from the Latin alphabet.
+
+    2. Split using word boundary rules from the Unicode Text Segmentation algorithm (see Unicode Standard Annex #29).
+       This is similar to the standard tokenizer in Apache Lucene and works well for most languages.
     """
 
     # fmt: off
@@ -21,22 +28,58 @@ class Tokenizer(Pipeline):
     # fmt: on
 
     @staticmethod
-    def tokenize(text):
+    def tokenize(text, lowercase=True, emoji=True, alphanum=True, stopwords=True):
         """
-        Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
+        Tokenizes text into a list of tokens. The default backwards compatible parameters filter out English stop words and only
+        accept alphanumeric tokens.
 
         Args:
             text: input text
+            lowercase: lowercases all tokens if True, defaults to True
+            emoji: tokenizes emoji in text if True, defaults to True
+            alphanum: requires 2+ character alphanumeric tokens if True, defaults to True
+            stopwords: removes provided stop words if a list, removes default English stop words if True, defaults to True
 
         Returns:
             list of tokens
         """
 
-        return Tokenizer()(text)
+        # Create a tokenizer with backwards compatible settings
+        return Tokenizer(lowercase, emoji, alphanum, stopwords)(text)
+
+    def __init__(self, lowercase=True, emoji=True, alphanum=False, stopwords=False):
+        """
+        Creates a new tokenizer. The default parameters segment text per Unicode Standard Annex #29.
+
+        Args:
+            lowercase: lowercases all tokens if True, defaults to True
+            emoji: tokenizes emoji in text if True, defaults to True
+            alphanum: requires 2+ character alphanumeric tokens if True, defaults to False
+            stopwords: removes provided stop words if a list, removes default English stop words if True, defaults to False
+        """
+
+        # Lowercase
+        self.lowercase = lowercase
+
+        # Text segmentation
+        self.alphanum, self.segment = None, None
+        if alphanum:
+            # Alphanumeric regex that accepts tokens that meet the following rules:
+            # - String must be at least 2 characters long AND
+            # - String must have at least 1 non-trailing alpha character
+            # Note: The standard Python re module is much faster than regex for this expression
+            self.alphanum = re.compile(r"^\d*[a-z][\-.0-9:_a-z]{1,}$")
+        else:
+            # Text segmentation per Unicode Standard Annex #29
+            pattern = r"\w\p{Extended_Pictographic}\p{WB:RegionalIndicator}" if emoji else r"\w"
+            self.segment = regex.compile(rf"[{pattern}](?:\B\S)*", flags=regex.WORD)
+
+        # Stop words
+        self.stopwords = stopwords if isinstance(stopwords, list) else Tokenizer.STOP_WORDS if stopwords else False
 
     def __call__(self, text):
         """
-        Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
+        Tokenizes text into a list of tokens.
 
         Args:
             text: input text
@@ -45,10 +88,21 @@ def __call__(self, text):
             list of tokens
         """
 
-        # Convert to all lowercase, split on whitespace, strip punctuation
-        tokens = [token.strip(string.punctuation) for token in text.lower().split()]
+        # Lowercase
+        text = text.lower() if self.lowercase else text
+
+        if self.alphanum:
+            # Text segmentation using standard split
+            tokens = [token.strip(string.punctuation) for token in text.split()]
+
+            # Filter on alphanumeric strings
+            tokens = [token for token in tokens if re.match(self.alphanum, token)]
+        else:
+            # Text segmentation per Unicode Standard Annex #29
+            tokens = regex.findall(self.segment, text)
+
+        # Stop words
+        if self.stopwords:
+            tokens = [token for token in tokens if token not in self.stopwords]
 
-        # Tokenize on alphanumeric strings.
-        # Require strings to be at least 2 characters long.
-        # Require at least 1 non-trailing alpha character in string.
-        return [token for token in tokens if re.match(r"^\d*[a-z][\-.0-9:_a-z]{1,}$", token) and token not in Tokenizer.STOP_WORDS]
+        return tokens
diff --git a/test/python/testpipeline/testtokenizer.py b/test/python/testpipeline/testtokenizer.py
index bfb100936..57819a1fc 100644
--- a/test/python/testpipeline/testtokenizer.py
+++ b/test/python/testpipeline/testtokenizer.py
@@ -12,10 +12,33 @@ class TestTokenizer(unittest.TestCase):
     Tokenizer tests.
     """
 
-    def testTokenize(self):
+    def testAlphanumTokenize(self):
         """
-        Test tokenize
+        Test alphanumeric tokenization
         """
 
+        # Alphanumeric tokenization through backwards compatible static method
         self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
         self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])
+
+    def testStandardTokenize(self):
+        """
+        Test standard tokenization
+        """
+
+        # Default standard tokenizer parameters
+        tokenizer = Tokenizer()
+
+        # Define token tests
+        tests = [
+            ("Y this is a test!", ["y", "this", "is", "a", "test"]),
+            ("abc123 ABC 123", ["abc123", "abc", "123"]),
+            ("Testing hy-phenated words", ["testing", "hy", "phenated", "words"]),
+            ("111-111-1111", ["111", "111", "1111"]),
+            ("Test.1234", ["test", "1234"]),
+        ]
+
+        # Run through tests
+        for test, result in tests:
+            # Unicode Text Segmentation per Unicode Standard Annex #29
+            self.assertEqual(tokenizer(test), result)
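
For reference, a minimal usage sketch of the two tokenization modes introduced by this change. The expected outputs are copied from the test cases above; the package-level import `from txtai.pipeline import Tokenizer` is an assumption, since the diff only shows the module path `txtai/pipeline/data/tokenizer.py`.

```python
from txtai.pipeline import Tokenizer

# Backwards compatible static method: lowercases text, keeps only 2+ character
# alphanumeric tokens and removes English stop words
Tokenizer.tokenize("Y this is a test!")   # ["test"]
Tokenizer.tokenize("abc123 ABC 123")      # ["abc123", "abc"]

# Default instance: Unicode Text Segmentation (Unicode Standard Annex #29),
# no alphanumeric filter and no stop word removal
tokenizer = Tokenizer()
tokenizer("Y this is a test!")            # ["y", "this", "is", "a", "test"]
tokenizer("111-111-1111")                 # ["111", "111", "1111"]

# UAX #29 segmentation with stop word removal enabled (custom combination,
# output not asserted by the tests above)
custom = Tokenizer(stopwords=True)
```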