Handle empty token list with word vectorization, closes #861

neuml · Jan 30, 2025 · e2e81cd · e2e81cd
1 parent fb56af8
commit e2e81cd
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/src/python/txtai/vectors/words.py b/src/python/txtai/vectors/words.py
@@ -125,9 +125,10 @@ def encode(self, data):
         # Iterate over each data element, tokenize (if necessary) and build an aggregated embeddings vector
         embeddings = []
         for tokens in data:
-            # Convert to tokens if necessary
+            # Convert to tokens, if necessary. If the tokenized list is empty, fall back to the input string as a single token.
             if isinstance(tokens, str):
-                tokens = Tokenizer.tokenize(tokens)
+                tokenlist = Tokenizer.tokenize(tokens)
+                tokens = tokenlist if tokenlist else [tokens]
 
             # Generate weights for each vector using a scoring method
             weights = self.scoring.weights(tokens) if self.scoring else None