From 79072841fed886b046c00f70ccb7120cf1e4ddca Mon Sep 17 00:00:00 2001
From: xiyuan lee
Date: Fri, 29 Nov 2024 10:11:15 +0800
Subject: [PATCH 1/2] BUG: Correct the input bytes data by langchain_openai
 #2589

---
 xinference/model/embedding/core.py | 33 ++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index 5848aa9289..cedb2e848b 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -21,6 +21,7 @@
 import numpy as np
 import torch
 
+from ..._compat import ROOT_KEY, ErrorWrapper, ValidationError
 from ...device_utils import empty_cache
 from ...types import Embedding, EmbeddingData, EmbeddingUsage
 from ..core import CacheableModelSpec, ModelDescription
@@ -223,7 +224,39 @@ def to(self, *args, **kwargs):
                 trust_remote_code=True,
             )
 
+    def _fix_langchain_openai_inputs(self, sentences: Union[str, List[str]]):
+        # Check if sentences is a two-dimensional list of integers
+        if isinstance(sentences, list) and all(
+            isinstance(item, list) and all(isinstance(i, int) for i in item)
+            for item in sentences
+        ):
+            import tiktoken
+
+            enc = tiktoken.get_encoding("cl100k_base")
+            lines_decoded = []
+
+            for line in sentences:
+                try:
+                    # Decode each token into bytes, then join them into a complete string
+                    output = b"".join(
+                        enc.decode_single_token_bytes(token) for token in line
+                    )
+                    # Convert the byte sequence into a UTF-8 encoded string
+                    decoded_line = output.decode("utf-8")
+                    lines_decoded.append(decoded_line)
+                except (ValueError, TypeError, UnicodeDecodeError) as e:
+                    raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], self)
+
+            # Update sentences to be the list of decoded strings
+            if len(lines_decoded) == 1:
+                sentences = lines_decoded[0]
+            else:
+                sentences = lines_decoded
+        return sentences
+
     def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+        sentences = self._fix_langchain_openai_inputs(sentences)
+
         from FlagEmbedding import BGEM3FlagModel
         from sentence_transformers import SentenceTransformer
 

From e14194fe99c35243bfeb5eeac5195c73b0606a91 Mon Sep 17 00:00:00 2001
From: qinxuye
Date: Fri, 29 Nov 2024 07:59:28 +0000
Subject: [PATCH 2/2] fix

---
 xinference/model/embedding/core.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/xinference/model/embedding/core.py b/xinference/model/embedding/core.py
index cedb2e848b..01ba9da65d 100644
--- a/xinference/model/embedding/core.py
+++ b/xinference/model/embedding/core.py
@@ -226,10 +226,14 @@ def to(self, *args, **kwargs):
 
     def _fix_langchain_openai_inputs(self, sentences: Union[str, List[str]]):
         # Check if sentences is a two-dimensional list of integers
-        if isinstance(sentences, list) and all(
-            isinstance(item, list) and all(isinstance(i, int) for i in item)
-            for item in sentences
+        if (
+            isinstance(sentences, list)
+            and len(sentences) > 0
+            and isinstance(sentences[0], list)
+            and len(sentences[0]) > 0
+            and isinstance(sentences[0][0], int)
         ):
+            # List[List[int]] stands for encoded inputs
            import tiktoken
 
             enc = tiktoken.get_encoding("cl100k_base")
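
Background for the patches above: langchain_openai's OpenAIEmbeddings tokenizes text on the
client and sends the embedding request as lists of integer token IDs (List[List[int]]) produced
with tiktoken's cl100k_base encoding, which is the input shape the new
_fix_langchain_openai_inputs helper detects and decodes back into strings. The sketch below
reproduces that round trip in isolation so the decoding step is easy to verify; it is a minimal
sketch, assuming only that tiktoken is installed, and the sample texts are placeholders rather
than data from the patch.

    import tiktoken

    # cl100k_base is the encoding used by OpenAI-style clients such as langchain_openai.
    enc = tiktoken.get_encoding("cl100k_base")

    # What the client effectively sends: token IDs instead of raw text.
    texts = ["hello world", "你好"]
    encoded_inputs = [enc.encode(text) for text in texts]  # List[List[int]]

    # Server-side recovery, mirroring _fix_langchain_openai_inputs: decode each
    # token to bytes, join the bytes, then interpret the whole sequence as UTF-8.
    decoded = []
    for line in encoded_inputs:
        raw = b"".join(enc.decode_single_token_bytes(token) for token in line)
        decoded.append(raw.decode("utf-8"))

    assert decoded == texts

Joining the raw bytes before calling .decode("utf-8") matters because a single multi-byte
character can be split across several BPE tokens; decoding each token individually could raise
UnicodeDecodeError for inputs like the Chinese sample above, which is why the patch decodes at
the byte level first.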