From 6e14cfef4106f9dbecd9aa79dc807d7c92252f8e Mon Sep 17 00:00:00 2001
From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com>
Date: Sun, 3 Dec 2023 10:35:47 -0500
Subject: [PATCH] Handle None input in Tokenizer, closes #607

---
 src/python/txtai/pipeline/data/tokenizer.py | 4 ++++
 test/python/testpipeline/testtokenizer.py   | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/src/python/txtai/pipeline/data/tokenizer.py b/src/python/txtai/pipeline/data/tokenizer.py
index 4c5b35acf..c7a401aa5 100644
--- a/src/python/txtai/pipeline/data/tokenizer.py
+++ b/src/python/txtai/pipeline/data/tokenizer.py
@@ -88,6 +88,10 @@ def __call__(self, text):
             list of tokens
         """
 
+        # Check for None and skip processing
+        if text is None:
+            return None
+
         # Lowercase
         text = text.lower() if self.lowercase else text
 
diff --git a/test/python/testpipeline/testtokenizer.py b/test/python/testpipeline/testtokenizer.py
index 57819a1fc..e27e473df 100644
--- a/test/python/testpipeline/testtokenizer.py
+++ b/test/python/testpipeline/testtokenizer.py
@@ -21,6 +21,15 @@ def testAlphanumTokenize(self):
         self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
         self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])
 
+    def testEmptyTokenize(self):
+        """
+        Test handling empty and None inputs
+        """
+
+        # Test that parser can handle empty or None strings
+        self.assertEqual(Tokenizer.tokenize(""), [])
+        self.assertEqual(Tokenizer.tokenize(None), None)
+
     def testStandardTokenize(self):
         """
         Test standard tokenization