diff --git a/src/python/txtai/pipeline/data/tokenizer.py b/src/python/txtai/pipeline/data/tokenizer.py
index 4c5b35acf..c7a401aa5 100644
--- a/src/python/txtai/pipeline/data/tokenizer.py
+++ b/src/python/txtai/pipeline/data/tokenizer.py
@@ -88,6 +88,10 @@ def __call__(self, text):
             list of tokens
         """
 
+        # None input has nothing to tokenize: return None and skip all processing below
+        if text is None:
+            return None
+
         # Lowercase
         text = text.lower() if self.lowercase else text
 
diff --git a/test/python/testpipeline/testtokenizer.py b/test/python/testpipeline/testtokenizer.py
index 57819a1fc..e27e473df 100644
--- a/test/python/testpipeline/testtokenizer.py
+++ b/test/python/testpipeline/testtokenizer.py
@@ -21,6 +21,15 @@ def testAlphanumTokenize(self):
         self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
         self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])
 
+    def testEmptyTokenize(self):
+        """
+        Test that empty text and None input are handled
+        """
+
+        # Empty text produces an empty token list, None input is returned as None
+        self.assertEqual(Tokenizer.tokenize(""), [])
+        self.assertEqual(Tokenizer.tokenize(None), None)
+
     def testStandardTokenize(self):
         """
         Test standard tokenization