FEAT: Add embedding models support #418

Merged: 6 commits, Sep 5, 2023
1 change: 1 addition & 0 deletions .github/workflows/python.yaml
@@ -93,6 +93,7 @@ jobs:
pip install transformers_stream_generator
pip install bitsandbytes
pip install ctransformers
pip install sentence-transformers
pip install -e ".[dev]"
working-directory: .

2 changes: 1 addition & 1 deletion examples/LangChain_QA.ipynb
@@ -404,4 +404,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
3 changes: 3 additions & 0 deletions setup.cfg
@@ -73,6 +73,7 @@ all =
protobuf
einops
tiktoken
sentence-transformers
ggml =
llama-cpp-python==0.1.77
ctransformers
@@ -86,6 +87,8 @@ pytorch =
protobuf
einops
tiktoken
embedding =
sentence-transformers
doc =
ipython>=6.5.0
sphinx>=3.0.0,<5.0.0
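
The new `embedding` extra makes the sentence-transformers dependency installable on its own; assuming the extras are installed via pip against the published `xinference` package, that would be:

pip install "xinference[embedding]"
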
144 changes: 80 additions & 64 deletions xinference/client.py
@@ -49,7 +49,29 @@ def __init__(self, model_ref: xo.ActorRefType["ModelActor"], isolation: Isolatio
self._isolation = isolation


class GenerateModelHandle(ModelHandle):
class EmbeddingModelHandle(ModelHandle):
def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Creates an embedding vector representing the input text.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulting Embedding vector, which can be consumed directly by machine learning models and algorithms.

"""

coro = self._model_ref.create_embedding(input)
return self._isolation.call(coro)
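
For context, a minimal usage sketch of the new handle (not part of this diff): it assumes a `Client` already connected to a running supervisor, that `launch_model` returns the model UID, and a hypothetical built-in embedding model named "gte-base".

# Hypothetical usage sketch; "gte-base" is a placeholder model name.
model_uid = client.launch_model(model_name="gte-base", model_type="embedding")
model = client.get_model(model_uid)  # returns an EmbeddingModelHandle
single = model.create_embedding("What is the capital of France?")
batch = model.create_embedding(["first sentence", "second sentence"])  # batched input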


class GenerateModelHandle(EmbeddingModelHandle):
def generate(
self,
prompt: str,
@@ -81,26 +103,6 @@ def generate(
coro = self._model_ref.generate(prompt, generate_config)
return self._isolation.call(coro)

def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Creates an embedding vector representing the input text.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulted Embedding vector that can be easily consumed by machine learning models and algorithms.

"""

coro = self._model_ref.create_embedding(input)
return self._isolation.call(coro)


class ChatModelHandle(GenerateModelHandle):
def chat(
@@ -147,7 +149,7 @@ def chat(
return self._isolation.call(coro)


class ChatglmCppChatModelHandle(ModelHandle):
class ChatglmCppChatModelHandle(EmbeddingModelHandle):
def chat(
self,
prompt: str,
@@ -241,7 +243,41 @@ def __init__(self, model_uid: str, base_url: str):
self._base_url = base_url


class RESTfulGenerateModelHandle(RESTfulModelHandle):
class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Create an Embedding from user input via RESTful APIs.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulting Embedding vector, which can be consumed directly by machine learning models and algorithms.

Raises
------
RuntimeError
Raised if creating the embeddings fails, with the error message returned by the server.

"""
url = f"{self._base_url}/v1/embeddings"
request_body = {"model": self._model_uid, "input": input}
response = requests.post(url, json=request_body)
if response.status_code != 200:
raise RuntimeError(
f"Failed to create the embeddings, detail: {response.json()['detail']}"
)

response_data = response.json()
return response_data
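
The REST handle is a thin wrapper over the `/v1/embeddings` endpoint, so the equivalent raw request follows directly from the method body above (the base URL and model UID are placeholders):

import requests

# Mirrors RESTfulEmbeddingModelHandle.create_embedding: POST the model UID
# and the input text(s) to /v1/embeddings.
response = requests.post(
    "http://localhost:9997/v1/embeddings",  # placeholder base URL
    json={"model": "my-embedding-model", "input": ["hello", "world"]},
)
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to create the embeddings, detail: {response.json()['detail']}"
    )
embeddings = response.json()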


class RESTfulGenerateModelHandle(RESTfulEmbeddingModelHandle):
def generate(
self,
prompt: str,
@@ -296,38 +332,6 @@ def generate(
response_data = response.json()
return response_data

def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Create an Embedding from user input via RESTful APIs.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulted Embedding vector that can be easily consumed by machine learning models and algorithms.

Raises
------
RuntimeError
Report the failure of embeddings and provide the error message.

"""
url = f"{self._base_url}/v1/embeddings"
request_body = {"model": self._model_uid, "input": input}
response = requests.post(url, json=request_body)
if response.status_code != 200:
raise RuntimeError(
f"Failed to create the embeddings, detail: {response.json()['detail']}"
)

response_data = response.json()
return response_data


class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
def chat(
@@ -407,7 +411,7 @@ def chat(
return response_data


class RESTfulChatglmCppChatModelHandle(RESTfulModelHandle):
class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
def chat(
self,
prompt: str,
@@ -556,6 +560,7 @@ def get_model_registration(
def launch_model(
self,
model_name: str,
model_type: str = "LLM",
model_size_in_billions: Optional[int] = None,
model_format: Optional[str] = None,
quantization: Optional[str] = None,
@@ -568,6 +573,8 @@ def launch_model(
----------
model_name: str
The name of the model.
model_type: str
The type of the model: "LLM" or "embedding". Defaults to "LLM".
model_size_in_billions: Optional[int]
The size (in billions) of the model.
model_format: Optional[str]
@@ -589,6 +596,7 @@
coro = self._supervisor_ref.launch_builtin_model(
model_uid=model_uid,
model_name=model_name,
model_type=model_type,
model_size_in_billions=model_size_in_billions,
model_format=model_format,
quantization=quantization,
@@ -648,15 +656,19 @@ def get_model(self, model_uid: str) -> "ModelHandle":
self._supervisor_ref.describe_model(model_uid)
)
model_ref = self._isolation.call(self._supervisor_ref.get_model(model_uid))

if desc["model_format"] == "ggmlv3" and "chatglm" in desc["model_name"]:
return ChatglmCppChatModelHandle(model_ref, self._isolation)
elif "chat" in desc["model_ability"]:
return ChatModelHandle(model_ref, self._isolation)
elif "generate" in desc["model_ability"]:
return GenerateModelHandle(model_ref, self._isolation)
if desc["model_type"] == "LLM":
if desc["model_format"] == "ggmlv3" and "chatglm" in desc["model_name"]:
return ChatglmCppChatModelHandle(model_ref, self._isolation)
elif "chat" in desc["model_ability"]:
return ChatModelHandle(model_ref, self._isolation)
elif "generate" in desc["model_ability"]:
return GenerateModelHandle(model_ref, self._isolation)
else:
raise ValueError(f"Unrecognized model ability: {desc['model_ability']}")
elif desc["model_type"] == "embedding":
return EmbeddingModelHandle(model_ref, self._isolation)
else:
raise ValueError(f"Unrecognized model ability: {desc['model_ability']}")
raise ValueError(f"Unknown model type:{desc['model_type']}")


class RESTfulClient:
@@ -693,6 +705,7 @@ def list_models(self) -> Dict[str, Dict[str, Any]]:
def launch_model(
self,
model_name: str,
model_type: str = "LLM",
model_size_in_billions: Optional[int] = None,
model_format: Optional[str] = None,
quantization: Optional[str] = None,
@@ -705,6 +718,8 @@ def launch_model(
----------
model_name: str
The name of the model.
model_type: str
The type of the model: "LLM" or "embedding". Defaults to "LLM".
model_size_in_billions: Optional[int]
The size (in billions) of the model.
model_format: Optional[str]
@@ -728,6 +743,7 @@
payload = {
"model_uid": model_uid,
"model_name": model_name,
"model_type": model_type,
"model_size_in_billions": model_size_in_billions,
"model_format": model_format,
"quantization": quantization,
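
End-to-end, the RESTful path mirrors the in-process one. A hedged sketch, assuming the server runs at the placeholder endpoint below, that `launch_model` returns the model UID, and that `RESTfulClient.get_model` returns the matching REST handle:

from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
model_uid = client.launch_model(model_name="gte-base", model_type="embedding")  # placeholder model name
model = client.get_model(model_uid)  # a RESTfulEmbeddingModelHandle
result = model.create_embedding("hello")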