BUG: Correct the input bytes data by langchain_openai #2589

xorbitsai · Nov 28, 2024 · 6d3255a · 6d3255a
1 parent 0d4cb9c
commit 6d3255a
Showing 1 changed file with 28 additions and 0 deletions.
diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
@@ -19,8 +19,10 @@
 from typing import Dict, List, Literal, Optional, Tuple, Union, no_type_check
 
 import numpy as np
+import tiktoken
 import torch
 
+from ..._compat import ROOT_KEY, ErrorWrapper, ValidationError
 from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
@@ -224,6 +226,32 @@ def to(self, *args, **kwargs):
             )
 
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        # Check if sentences is a two-dimensional list of integers
+        if isinstance(sentences, list) and all(
+            isinstance(item, list) and all(isinstance(i, int) for i in item)
+            for item in sentences
+        ):
+            enc = tiktoken.get_encoding("cl100k_base")
+            lines_decoded = []
+
+            for line in sentences:
+                try:
+                    # Decode each token into its raw bytes, then join them into a single byte sequence
+                    output = b"".join(
+                        enc.decode_single_token_bytes(token) for token in line
+                    )
+                    # Decode the UTF-8 byte sequence into a string
+                    decoded_line = output.decode("utf-8")
+                    lines_decoded.append(decoded_line)
+
+                    logger.info(f"235 ------> {type(sentences)}, {sentences=}")
+                    # raise UnicodeDecodeError
+                except (ValueError, TypeError, UnicodeDecodeError) as e:
+                    logger.error(f"238 ------> {type(sentences)}, {sentences=}")
+                    raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], self)
+
+            # Update sentences to be the list of decoded strings
+            sentences = lines_decoded
         from FlagEmbedding import BGEM3FlagModel
         from sentence_transformers import SentenceTransformer