FEAT: Add embedding models support #418

Merged: 6 commits, Sep 5, 2023
1 change: 1 addition & 0 deletions .github/workflows/python.yaml
@@ -93,6 +93,7 @@ jobs:
pip install transformers_stream_generator
pip install bitsandbytes
pip install ctransformers
pip install sentence-transformers
pip install -e ".[dev]"
working-directory: .

2 changes: 1 addition & 1 deletion examples/LangChain_QA.ipynb
@@ -404,4 +404,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
3 changes: 3 additions & 0 deletions setup.cfg
@@ -73,6 +73,7 @@ all =
protobuf
einops
tiktoken
sentence-transformers
ggml =
llama-cpp-python==0.1.77
ctransformers
@@ -86,6 +87,8 @@ pytorch =
protobuf
einops
tiktoken
embedding =
sentence-transformers
doc =
ipython>=6.5.0
sphinx>=3.0.0,<5.0.0
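
The new `embedding` extra makes the sentence-transformers dependency installable on its own; assuming the extras are installed via pip against the published `xinference` package, that would be:

pip install "xinference[embedding]"
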
144 changes: 80 additions & 64 deletions xinference/client.py
@@ -49,7 +49,29 @@ def __init__(self, model_ref: xo.ActorRefType["ModelActor"], isolation: Isolatio
self._isolation = isolation


class GenerateModelHandle(ModelHandle):
class EmbeddingModelHandle(ModelHandle):
def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Creates an embedding vector representing the input text.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulting Embedding vector, which can be consumed directly by machine learning models and algorithms.

"""

coro = self._model_ref.create_embedding(input)
return self._isolation.call(coro)
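
For context, a minimal usage sketch of the new handle (not part of this diff): it assumes a `Client` already connected to a running supervisor, that `launch_model` returns the model UID, and a hypothetical built-in embedding model named "gte-base".

# Hypothetical usage sketch; "gte-base" is a placeholder model name.
model_uid = client.launch_model(model_name="gte-base", model_type="embedding")
model = client.get_model(model_uid)  # returns an EmbeddingModelHandle
single = model.create_embedding("What is the capital of France?")
batch = model.create_embedding(["first sentence", "second sentence"])  # batched input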


class GenerateModelHandle(EmbeddingModelHandle):
def generate(
self,
prompt: str,
@@ -81,26 +103,6 @@ def generate(
coro = self._model_ref.generate(prompt, generate_config)
return self._isolation.call(coro)

def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Creates an embedding vector representing the input text.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulted Embedding vector that can be easily consumed by machine learning models and algorithms.

"""

coro = self._model_ref.create_embedding(input)
return self._isolation.call(coro)


class ChatModelHandle(GenerateModelHandle):
def chat(
@@ -147,7 +149,7 @@ def chat(
return self._isolation.call(coro)


class ChatglmCppChatModelHandle(ModelHandle):
class ChatglmCppChatModelHandle(EmbeddingModelHandle):
def chat(
self,
prompt: str,
@@ -241,7 +243,41 @@ def __init__(self, model_uid: str, base_url: str):
self._base_url = base_url


class RESTfulGenerateModelHandle(RESTfulModelHandle):
class RESTfulEmbeddingModelHandle(RESTfulModelHandle):
def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Create an Embedding from user input via RESTful APIs.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulting Embedding vector, which can be consumed directly by machine learning models and algorithms.

Raises
------
RuntimeError
Raised if creating the embeddings fails, with the error message returned by the server.

"""
url = f"{self._base_url}/v1/embeddings"
request_body = {"model": self._model_uid, "input": input}
response = requests.post(url, json=request_body)
if response.status_code != 200:
raise RuntimeError(
f"Failed to create the embeddings, detail: {response.json()['detail']}"
)

response_data = response.json()
return response_data
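
The REST handle is a thin wrapper over the `/v1/embeddings` endpoint, so the equivalent raw request follows directly from the method body above (the base URL and model UID are placeholders):

import requests

# Mirrors RESTfulEmbeddingModelHandle.create_embedding: POST the model UID
# and the input text(s) to /v1/embeddings.
response = requests.post(
    "http://localhost:9997/v1/embeddings",  # placeholder base URL
    json={"model": "my-embedding-model", "input": ["hello", "world"]},
)
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to create the embeddings, detail: {response.json()['detail']}"
    )
embeddings = response.json()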


class RESTfulGenerateModelHandle(RESTfulEmbeddingModelHandle):
def generate(
self,
prompt: str,
@@ -296,38 +332,6 @@ def generate(
response_data = response.json()
return response_data

def create_embedding(self, input: Union[str, List[str]]) -> "Embedding":
"""
Create an Embedding from user input via RESTful APIs.

Parameters
----------
input: Union[str, List[str]]
Input text to embed, encoded as a string or array of tokens.
To embed multiple inputs in a single request, pass an array of strings or array of token arrays.

Returns
-------
Embedding
The resulted Embedding vector that can be easily consumed by machine learning models and algorithms.

Raises
------
RuntimeError
Report the failure of embeddings and provide the error message.

"""
url = f"{self._base_url}/v1/embeddings"
request_body = {"model": self._model_uid, "input": input}
response = requests.post(url, json=request_body)
if response.status_code != 200:
raise RuntimeError(
f"Failed to create the embeddings, detail: {response.json()['detail']}"
)

response_data = response.json()
return response_data


class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
def chat(
@@ -407,7 +411,7 @@ def chat(
return response_data


class RESTfulChatglmCppChatModelHandle(RESTfulModelHandle):
class RESTfulChatglmCppChatModelHandle(RESTfulEmbeddingModelHandle):
def chat(
self,
prompt: str,
@@ -556,6 +560,7 @@ def get_model_registration(
def launch_model(
self,
model_name: str,
model_type: str = "LLM",
model_size_in_billions: Optional[int] = None,
model_format: Optional[str] = None,
quantization: Optional[str] = None,
@@ -568,6 +573,8 @@ def launch_model(
----------
model_name: str
The name of the model.
model_type: str
The type of the model: "LLM" or "embedding". Defaults to "LLM".
model_size_in_billions: Optional[int]
The size (in billions) of the model.
model_format: Optional[str]
@@ -589,6 +596,7 @@
coro = self._supervisor_ref.launch_builtin_model(
model_uid=model_uid,
model_name=model_name,
model_type=model_type,
model_size_in_billions=model_size_in_billions,
model_format=model_format,
quantization=quantization,
@@ -648,15 +656,19 @@ def get_model(self, model_uid: str) -> "ModelHandle":
self._supervisor_ref.describe_model(model_uid)
)
model_ref = self._isolation.call(self._supervisor_ref.get_model(model_uid))

if desc["model_format"] == "ggmlv3" and "chatglm" in desc["model_name"]:
return ChatglmCppChatModelHandle(model_ref, self._isolation)
elif "chat" in desc["model_ability"]:
return ChatModelHandle(model_ref, self._isolation)
elif "generate" in desc["model_ability"]:
return GenerateModelHandle(model_ref, self._isolation)
if desc["model_type"] == "LLM":
if desc["model_format"] == "ggmlv3" and "chatglm" in desc["model_name"]:
return ChatglmCppChatModelHandle(model_ref, self._isolation)
elif "chat" in desc["model_ability"]:
return ChatModelHandle(model_ref, self._isolation)
elif "generate" in desc["model_ability"]:
return GenerateModelHandle(model_ref, self._isolation)
else:
raise ValueError(f"Unrecognized model ability: {desc['model_ability']}")
elif desc["model_type"] == "embedding":
return EmbeddingModelHandle(model_ref, self._isolation)
else:
raise ValueError(f"Unrecognized model ability: {desc['model_ability']}")
raise ValueError(f"Unknown model type:{desc['model_type']}")


class RESTfulClient:
@@ -693,6 +705,7 @@ def list_models(self) -> Dict[str, Dict[str, Any]]:
def launch_model(
self,
model_name: str,
model_type: str = "LLM",
model_size_in_billions: Optional[int] = None,
model_format: Optional[str] = None,
quantization: Optional[str] = None,
@@ -705,6 +718,8 @@ def launch_model(
----------
model_name: str
The name of the model.
model_type: str
The type of the model: "LLM" or "embedding". Defaults to "LLM".
model_size_in_billions: Optional[int]
The size (in billions) of the model.
model_format: Optional[str]
@@ -728,6 +743,7 @@
payload = {
"model_uid": model_uid,
"model_name": model_name,
"model_type": model_type,
"model_size_in_billions": model_size_in_billions,
"model_format": model_format,
"quantization": quantization,
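
End-to-end, the RESTful path mirrors the in-process one. A hedged sketch, assuming the server runs at the placeholder endpoint below, that `launch_model` returns the model UID, and that `RESTfulClient.get_model` returns the matching REST handle:

from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
model_uid = client.launch_model(model_name="gte-base", model_type="embedding")  # placeholder model name
model = client.get_model(model_uid)  # a RESTfulEmbeddingModelHandle
result = model.create_embedding("hello")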