
Support Unicode Text Segmentation in Tokenizer, closes #507
davidmezzetti committed Jul 18, 2023
1 parent b21df33 commit dcd8067
Showing 2 changed files with 90 additions and 13 deletions.
76 changes: 65 additions & 11 deletions src/python/txtai/pipeline/data/tokenizer.py
@@ -5,12 +5,19 @@
import re
import string

import regex

from ..base import Pipeline


class Tokenizer(Pipeline):
"""
- Tokenizes text into a list of tokens. Primarily designed for English text.
Tokenizes text into tokens using one of the following methods.
1. Backwards compatible tokenization that only accepts alphanumeric tokens from the Latin alphabet.
2. Split using word boundary rules from the Unicode Text Segmentation algorithm (see Unicode Standard Annex #29).
This is similar to the standard tokenizer in Apache Lucene and works well for most languages.
"""

# fmt: off
@@ -21,22 +28,58 @@ class Tokenizer(Pipeline):
# fmt: on

@staticmethod
- def tokenize(text):
def tokenize(text, lowercase=True, emoji=True, alphanum=True, stopwords=True):
"""
- Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
Tokenizes text into a list of tokens. The default backwards compatible parameters filter out English stop words and only
accept alphanumeric tokens.
Args:
text: input text
lowercase: lower cases all tokens if True, defaults to True
emoji: tokenize emoji in text if True, defaults to True
alphanum: requires 2+ character alphanumeric tokens if True, defaults to True
stopwords: removes provided stop words if a list, removes default English stop words if True, defaults to True
Returns:
list of tokens
"""

- return Tokenizer()(text)
# Create a tokenizer with backwards compatible settings
return Tokenizer(lowercase, emoji, alphanum, stopwords)(text)

def __init__(self, lowercase=True, emoji=True, alphanum=False, stopwords=False):
"""
Creates a new tokenizer. The default parameters segment text per Unicode Standard Annex #29.
Args:
lowercase: lower cases all tokens if True, defaults to True
emoji: tokenize emoji in text if True, defaults to True
alphanum: requires 2+ character alphanumeric tokens if True, defaults to False
stopwords: removes provided stop words if a list, removes default English stop words if True, defaults to False
"""

# Lowercase
self.lowercase = lowercase

# Text segmentation
self.alphanum, self.segment = None, None
if alphanum:
# Alphanumeric regex that accepts tokens that meet following rules:
# - Strings to be at least 2 characters long AND
# - At least 1 non-trailing alpha character in string
# Note: The standard Python re module is much faster than regex for this expression
self.alphanum = re.compile(r"^\d*[a-z][\-.0-9:_a-z]{1,}$")
else:
# Text segmentation per Unicode Standard Annex #29
pattern = r"\w\p{Extended_Pictographic}\p{WB:RegionalIndicator}" if emoji else r"\w"
self.segment = regex.compile(rf"[{pattern}](?:\B\S)*", flags=regex.WORD)

# Stop words
self.stopwords = stopwords if isinstance(stopwords, list) else Tokenizer.STOP_WORDS if stopwords else False

def __call__(self, text):
"""
- Tokenizes input text into a list of tokens. Filters tokens that match a specific pattern and removes stop words.
Tokenizes text into a list of tokens.
Args:
text: input text
@@ -45,10 +88,21 @@ def __call__(self, text):
list of tokens
"""

- # Convert to all lowercase, split on whitespace, strip punctuation
- tokens = [token.strip(string.punctuation) for token in text.lower().split()]
# Lowercase
text = text.lower() if self.lowercase else text

if self.alphanum:
# Text segmentation using standard split
tokens = [token.strip(string.punctuation) for token in text.split()]

# Filter on alphanumeric strings.
tokens = [token for token in tokens if re.match(self.alphanum, token)]
else:
# Text segmentation per Unicode Standard Annex #29
tokens = regex.findall(self.segment, text)

# Stop words
if self.stopwords:
tokens = [token for token in tokens if token not in self.stopwords]

- # Tokenize on alphanumeric strings.
- # Require strings to be at least 2 characters long.
- # Require at least 1 non-trailing alpha character in string.
- return [token for token in tokens if re.match(r"^\d*[a-z][\-.0-9:_a-z]{1,}$", token) and token not in Tokenizer.STOP_WORDS]
return tokens
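
Note (not part of the commit): a minimal usage sketch of the two tokenization modes above, assuming the Tokenizer class is exposed as txtai.pipeline.Tokenizer. The expected outputs mirror the test cases in the next file.

from txtai.pipeline import Tokenizer

# Backwards compatible static method: lowercases text, keeps only 2+ character
# alphanumeric tokens and removes the default English stop words
print(Tokenizer.tokenize("Y this is a test!"))  # ["test"]
print(Tokenizer.tokenize("abc123 ABC 123"))     # ["abc123", "abc"]

# Default instance settings: Unicode Text Segmentation (UAX #29) word rules,
# no alphanumeric filter and no stop word removal
tokenizer = Tokenizer()
print(tokenizer("abc123 ABC 123"))              # ["abc123", "abc", "123"]
print(tokenizer("111-111-1111"))                # ["111", "111", "1111"]
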
27 changes: 25 additions & 2 deletions test/python/testpipeline/testtokenizer.py
@@ -12,10 +12,33 @@ class TestTokenizer(unittest.TestCase):
Tokenizer tests.
"""

- def testTokenize(self):
def testAlphanumTokenize(self):
"""
- Test tokenize
Test alphanumeric tokenization
"""

# Alphanumeric tokenization through backwards compatible static method
self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])

def testStandardTokenize(self):
"""
Test standard tokenization
"""

# Default standard tokenizer parameters
tokenizer = Tokenizer()

# Define token tests
tests = [
("Y this is a test!", ["y", "this", "is", "a", "test"]),
("abc123 ABC 123", ["abc123", "abc", "123"]),
("Testing hy-phenated words", ["testing", "hy", "phenated", "words"]),
("111-111-1111", ["111", "111", "1111"]),
("Test.1234", ["test", "1234"]),
]

# Run through tests
for test, result in tests:
# Unicode Text Segmentation per Unicode Annex #29
self.assertEqual(tokenizer(test), result)
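
Note (not part of the commit): the new constructor parameters can also be combined, for example keeping original case while stripping a custom stop word list. A short sketch, again assuming the txtai.pipeline.Tokenizer import path; the expected output is inferred from the __call__ logic above rather than taken from a test in this commit.

from txtai.pipeline import Tokenizer

# UAX #29 segmentation with original case preserved and a custom stop word list;
# stop words are compared against tokens as-is, so matching case matters
tokenizer = Tokenizer(lowercase=False, stopwords=["is", "a"])
print(tokenizer("This is a Test"))  # expected: ["This", "Test"]
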
