Skip to content

Commit

Permalink
Handle None input in Tokenizer, closes #607
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Dec 3, 2023
1 parent 07069ad commit 6e14cfe
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/python/txtai/pipeline/data/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ def __call__(self, text):
list of tokens
"""

# Check for None and skip processing
if text is None:
return None

# Lowercase
text = text.lower() if self.lowercase else text

Expand Down
9 changes: 9 additions & 0 deletions test/python/testpipeline/testtokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@ def testAlphanumTokenize(self):
self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])

def testEmptyTokenize(self):
    """
    Verify tokenization of empty string and None inputs
    """

    # An empty string produces an empty token list
    tokens = Tokenizer.tokenize("")
    self.assertEqual(tokens, [])

    # None input is returned as None instead of being processed
    missing = Tokenizer.tokenize(None)
    self.assertEqual(missing, None)

def testStandardTokenize(self):
"""
Test standard tokenization
Expand Down

0 comments on commit 6e14cfe

Please sign in to comment.