Deployment: add local model deployment option #77

Merged · 8 commits · May 2, 2024
Changes from all commits
2 changes: 2 additions & 0 deletions README.md
@@ -300,6 +300,8 @@ A model deployment is a running version of one of the Cohere command models. The
- This model deployment calls into your Azure deployment. To create an Azure deployment, [follow these steps](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-cohere-command). Once you have a model deployed, get the endpoint URL and API key from Azure AI Studio (https://ai.azure.com/build/): Project -> Deployments -> click your deployment -> the URL and API key are shown there. Note that to use the Cohere SDK you need to append `/v1` to the end of the URL (see the Cohere SDK sketch after this list).
- SageMaker (model_deployments/sagemaker.py)
- This deployment option calls into your SageMaker deployment. To create a SageMaker endpoint, [follow the steps here](https://docs.cohere.com/docs/amazon-sagemaker-setup-guide), or alternatively [follow a Command notebook here](https://github.com/cohere-ai/cohere-aws/tree/main/notebooks/sagemaker). Note your region and endpoint name when executing the notebook, as these will be needed as environment variables.
- Local models with LlamaCPP (community/model_deployments/local_model.py)
- This deployment option calls into a local model. To use it, you will need to download a model; you can use Cohere Command models or choose from the range of other models listed [here](https://github.com/ggerganov/llama.cpp). You will also need to enable community features for this deployment by setting `USE_COMMUNITY_FEATURES=True` in your .env file.
- To add your own deployment:
1. Create a deployment file, add it to the [/community/model_deployments](https://github.com/cohere-ai/cohere-toolkit/tree/main/src/community/model_deployments) folder, and implement the methods from `BaseDeployment`, similar to the other deployments (a minimal sketch follows this list).
2. Add the deployment to [src/community/config/deployments.py](https://github.com/cohere-ai/cohere-toolkit/blob/main/src/community/config/deployments.py).
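
As a rough illustration of the two steps above, a new community deployment might look like the sketch below. The class name and echo behaviour are invented for illustration; the method set mirrors what `LocalModelDeployment` implements in this PR and is assumed to be what `BaseDeployment` expects.

```python
# community/model_deployments/my_deployment.py -- hypothetical example
from typing import Any, Dict, List

from backend.schemas.cohere_chat import CohereChatRequest
from community.model_deployments import BaseDeployment


class MyDeployment(BaseDeployment):
    """Toy deployment that simply echoes the user's message back."""

    @property
    def rerank_enabled(self) -> bool:
        return False

    @classmethod
    def list_models(cls) -> List[str]:
        return ["my-model"]

    @classmethod
    def is_available(cls) -> bool:
        return True

    def invoke_chat(self, chat_request: CohereChatRequest, **kwargs: Any) -> Any:
        return {"text": f"Echo: {chat_request.message}"}

    def invoke_chat_stream(self, chat_request: CohereChatRequest, **kwargs: Any) -> Any:
        yield {"event_type": "stream-start", "generation_id": "", "is_finished": False}
        yield {
            "event_type": "text-generation",
            "text": f"Echo: {chat_request.message}",
            "is_finished": False,
        }
        yield {"event_type": "stream-end", "finish_reason": "COMPLETE", "is_finished": True}

    def invoke_search_queries(
        self, message: str, chat_history: List[Dict[str, str]] | None = None, **kwargs: Any
    ) -> List[str]:
        return [message]

    def invoke_rerank(
        self, query: str, documents: List[Dict[str, Any]], **kwargs: Any
    ) -> Any:
        return None
```

The new class would then be registered in `src/community/config/deployments.py` with its own `ModelDeploymentName` entry, mirroring the `LocalModel` entry added in this PR.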
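
A hedged sketch of the `/v1` note on the Azure deployment above: the snippet below calls an Azure-hosted Command model directly with the Cohere Python SDK. The endpoint URL shape, the placeholder key, and the use of the SDK's `base_url` parameter are illustrative assumptions, not the toolkit's own Azure deployment code.

```python
# Hypothetical illustration only: point the Cohere SDK at an Azure deployment.
# Use the endpoint URL and API key shown in Azure AI Studio, with /v1 appended.
import cohere

co = cohere.Client(
    api_key="<your-azure-api-key>",  # placeholder
    base_url="https://<your-deployment>.<region>.inference.ai.azure.com/v1",  # assumed URL shape
)

response = co.chat(message="Hello from an Azure-hosted Command model")
print(response.text)
```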
2 changes: 1 addition & 1 deletion docs/postman/Toolkit.postman_collection.json
@@ -455,7 +455,7 @@
{
"key": "file",
"type": "file",
"src": "/Users/luisa/Downloads/Aya_dataset__ACL_edition.pdf"
"src": "/Users/luisa/Downloads/Aya_dataset.pdf"
}
]
},
35 changes: 34 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -62,6 +62,7 @@ llama-index = "^0.10.11"
wolframalpha = "^5.0.0"
transformers = "^4.40.1"
torch = "^2.3.0"
llama-cpp-python = "^0.2.67"

[build-system]
requires = ["poetry-core"]
2 changes: 1 addition & 1 deletion src/backend/chat/custom/model_deployments/deployment.py
@@ -18,7 +18,7 @@ def get_deployment(deployment_name) -> BaseDeployment:

# Check provided deployment against config const
if deployment is not None and deployment.is_available:
return deployment.deployment_class()
return deployment.deployment_class(**deployment.kwargs)

# Fallback to first available deployment
for deployment in AVAILABLE_MODEL_DEPLOYMENTS.values():
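
The change to `get_deployment` forwards any `kwargs` declared on a deployment's config entry to its constructor, so per-deployment arguments such as `model_path` no longer need to be hard-coded. Below is a minimal sketch of that flow; the `Deployment` field names and import paths are taken from the schema and config changes elsewhere in this PR and should be treated as assumptions, not a verbatim excerpt.

```python
# Illustrative only: kwargs declared on the config object are passed straight
# through to the deployment class constructor by get_deployment().
from community.model_deployments import Deployment
from community.model_deployments.local_model import LocalModelDeployment

deployment = Deployment(
    name="LocalModel",
    models=[],
    is_available=True,
    deployment_class=LocalModelDeployment,
    env_vars=[],
    kwargs={"model_path": "path/to/model"},
)

# Equivalent to LocalModelDeployment(model_path="path/to/model")
instance = deployment.deployment_class(**deployment.kwargs)
```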
3 changes: 2 additions & 1 deletion src/backend/schemas/deployment.py
@@ -1,4 +1,4 @@
from typing import Type
from typing import Optional, Type

from pydantic import BaseModel, Field

@@ -11,6 +11,7 @@ class Deployment(BaseModel):
is_available: bool = Field(exclude=True)
deployment_class: Type[BaseDeployment] = Field(exclude=True)
env_vars: list[str]
kwargs: Optional[dict] = Field(exclude=True, default={})

class Config:
from_attributes = True
12 changes: 12 additions & 0 deletions src/community/config/deployments.py
@@ -2,10 +2,12 @@

from community.model_deployments import Deployment
from community.model_deployments.hugging_face import HuggingFaceDeployment
from community.model_deployments.local_model import LocalModelDeployment


class ModelDeploymentName(StrEnum):
HuggingFace = "HuggingFace"
LocalModel = "LocalModel"


AVAILABLE_MODEL_DEPLOYMENTS = {
@@ -16,4 +18,14 @@ class ModelDeploymentName(StrEnum):
is_available=HuggingFaceDeployment.is_available(),
env_vars=[],
),
ModelDeploymentName.LocalModel: Deployment(
name=ModelDeploymentName.LocalModel,
deployment_class=LocalModelDeployment,
models=LocalModelDeployment.list_models(),
is_available=LocalModelDeployment.is_available(),
env_vars=[],
kwargs={
"model_path": "path/to/model", # Note that the model needs to be in the src directory
},
),
}
264 changes: 264 additions & 0 deletions src/community/model_deployments/local_model.py
@@ -0,0 +1,264 @@
import logging
from typing import Any, Dict, List

from llama_cpp import Llama

from backend.schemas.cohere_chat import CohereChatRequest
from community.model_deployments import BaseDeployment


class LocalModelDeployment(BaseDeployment):
def __init__(self, model_path: str, template: str = None):
self.prompt_template = PromptTemplate()
self.model_path = model_path
self.template = template

@property
def rerank_enabled(self) -> bool:
return False

@classmethod
def list_models(cls) -> List[str]:
return []

@classmethod
def is_available(cls) -> bool:
return True

def invoke_chat_stream(self, chat_request: CohereChatRequest, **kwargs: Any) -> Any:
model = self._get_model()

if chat_request.max_tokens is None:
chat_request.max_tokens = 200

if len(chat_request.documents) == 0:
prompt = self.prompt_template.dummy_chat_template(
chat_request.message, chat_request.chat_history
)
else:
prompt = self.prompt_template.dummy_rag_template(
chat_request.message, chat_request.chat_history, chat_request.documents
)

stream = model(
prompt,
stream=True,
max_tokens=chat_request.max_tokens,
temperature=chat_request.temperature,
)

yield {
"event_type": "stream-start",
"generation_id": "",
"is_finished": False,
}

for item in stream:
yield {
"event_type": "text-generation",
"text": item["choices"][0]["text"],
"is_finished": False,
}

yield {
"event_type": "stream-end",
"finish_reason": "COMPLETE",
"is_finished": True,
}

def invoke_chat(self, chat_request: CohereChatRequest, **kwargs: Any) -> Any:
model = self._get_model()

if chat_request.max_tokens is None:
chat_request.max_tokens = 200

response = model(
chat_request.message,
stream=False,
max_tokens=chat_request.max_tokens,
temperature=chat_request.temperature,
)

return {"text": response["choices"][0]["text"]}

def _get_model(self):
model = Llama(
model_path=self.model_path,
verbose=False,
)

return model

def invoke_search_queries(
self,
message: str,
chat_history: List[Dict[str, str]] | None = None,
**kwargs: Any,
) -> List[str]:
return [message]

def invoke_rerank(
self, query: str, documents: List[Dict[str, Any]], **kwargs: Any
) -> Any:
return None


class PromptTemplate:
"""
Template for generating prompts for different types of requests.
"""

def dummy_chat_template(
self, message: str, chat_history: List[Dict[str, str]]
) -> str:
prompt = "System: You are an AI assistant whose goal is to help users by consuming and using the output of various tools. You will be able to see the conversation history between yourself and user and will follow instructions on how to respond."
prompt += "\n\n"
prompt += "Conversation:\n"
for chat in chat_history:
if chat["role"].lower() == "user":
prompt += f"User: {chat['message']}\n"
else:
prompt += f"Chatbot: {chat['message']}\n"

prompt += f"User: {message}\n"
prompt += "Chatbot: "

return prompt

def dummy_rag_template(
self,
message: str,
chat_history: List[Dict[str, str]],
documents: List[Dict[str, str]],
max_docs: int = 5,
) -> str:
max_docs = min(max_docs, len(documents))
prompt = "System: You are an AI assistant whose goal is to help users by consuming and using the output of various tools. You will be able to see the conversation history between yourself and user and will follow instructions on how to respond."

doc_str_list = []
for doc_idx, doc in enumerate(documents[:max_docs]):
if doc_idx > 0:
doc_str_list.append("")

# only use first 200 words of the document to avoid exceeding context window
text = doc["text"]
if len(text.split()) > 200:
text = " ".join(text.split()[:200])

doc_str_list.extend([f"Document: {doc_idx}", doc["title"], text])

doc_str = "\n".join(doc_str_list)

chat_history.append({"role": "system", "message": doc_str})
chat_history.append({"role": "user", "message": message})

chat_hist_str = ""
for turn in chat_history:
if turn["role"].lower() == "user":
chat_hist_str += "User: "
elif turn["role"].lower() == "chatbot":
chat_hist_str += "Chatbot: "
else: # role == system
chat_hist_str += "System: "

chat_hist_str += turn["message"] + "\n"

prompt += "\n\n"
prompt += "Conversation:\n"
prompt += chat_hist_str
prompt += "Chatbot: "

return prompt

# https://docs.cohere.com/docs/prompting-command-r#formatting-chat-history-and-tool-outputs
def cohere_rag_template(
self,
message: str,
chat_history: List[Dict[str, str]],
documents: List[Dict[str, str]],
preamble: str = None,
max_docs: int = 5,
) -> str:
max_docs = min(max_docs, len(documents))
chat_history.append({"role": "user", "message": message})
SAFETY_PREAMBLE = "The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral."
BASIC_RULES = "You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions."
TASK_CONTEXT = "You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging."
STYLE_GUIDE = "Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."
documents = self._get_cohere_documents_template(documents, max_docs)
chat_history = self._get_cohere_chat_history_template(chat_history)
INSTRUCTIONS = """Carefully perform the following instructions, in order, starting each with a new line.
Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'.
Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'.
Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.
Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols <co: doc> and </co: doc> to indicate when a fact comes from a document in the search result, e.g <co: 0>my fact</co: 0> for a fact from document 0."""

tool_prompt_template = f"""<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> # Safety Preamble
{SAFETY_PREAMBLE}

# System Preamble
## Basic Rules
{BASIC_RULES}

# User Preamble
"""
if preamble:
tool_prompt_template += f"""{preamble}\n\n"""

tool_prompt_template += f"""## Task and Context
{TASK_CONTEXT}

## Style Guide
{STYLE_GUIDE}<|END_OF_TURN_TOKEN|>{chat_history}"""

if documents:
tool_prompt_template += f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{documents}<|END_OF_TURN_TOKEN|>"""

tool_prompt_template += f"""<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{INSTRUCTIONS}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"""

return tool_prompt_template

def _get_cohere_documents_template(
self, documents: List[Dict[str, str]], max_docs: int
) -> str:
max_docs = min(max_docs, len(documents))
doc_str_list = ["<results>"]
for doc_idx, doc in enumerate(documents[:max_docs]):
if doc_idx > 0:
doc_str_list.append("")
doc_str_list.extend([f"Document: {doc_idx}", doc["title"], doc["text"]])
doc_str_list.append("</results>")
return "\n".join(doc_str_list)

def _get_cohere_chat_history_template(
self, chat_history: List[Dict[str, str]]
) -> str:
chat_hist_str = ""
for turn in chat_history:
chat_hist_str += "<|START_OF_TURN_TOKEN|>"
if turn["role"] == "user":
chat_hist_str += "<|USER_TOKEN|>"
elif turn["role"] == "chatbot":
chat_hist_str += "<|CHATBOT_TOKEN|>"
else: # role == system
chat_hist_str += "<|SYSTEM_TOKEN|>"
chat_hist_str += turn["message"]
chat_hist_str += "<|END_OF_TURN_TOKEN|>"
return chat_hist_str


if __name__ == "__main__":
model = LocalModelDeployment(model_path="path/to/model")

print("--- Chat Stream ---")
response = model.invoke_chat_stream(
CohereChatRequest(message="hello world", temperature=0.3)
)
for item in response:
print(item)

print("\n--- Chat ---")
response = model.invoke_chat(
CohereChatRequest(message="hello world", temperature=0.3)
)
print(response)