6 changes: 5 additions & 1 deletion examples/offline_inference/disaggregated_prefill.py
@@ -95,7 +95,7 @@ def run_decode(prefill_done):
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
def main():
prefill_done = Event()
prefill_process = Process(target=run_prefill, args=(prefill_done, ))
decode_process = Process(target=run_decode, args=(prefill_done, ))
@@ -109,3 +109,7 @@ def run_decode(prefill_done):
# Terminate the prefill node when decode is finished
decode_process.join()
prefill_process.terminate()


if __name__ == "__main__":
main()
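
Note: a minimal sketch (not part of the diff) of why the `main()` plus guard pattern matters for these multiprocessing examples. On platforms where Python uses the `spawn` start method, each child `Process` re-imports the module, so any unguarded module-level code would run again in every child:

from multiprocessing import Event, Process

def worker(done):
    # Child-process body; under "spawn" it runs in a fresh interpreter
    # that re-imports this module before calling the target.
    done.set()

def main():
    done = Event()
    p = Process(target=worker, args=(done,))
    p.start()
    done.wait()
    p.join()

if __name__ == "__main__":
    # Without this guard, a spawned child re-importing the module
    # would recursively start another Process.
    main()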
15 changes: 9 additions & 6 deletions examples/offline_inference/disaggregated_prefill_lmcache.py
@@ -38,6 +38,10 @@
# `naive` indicates using raw bytes of the tensor without any compression
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"

prompts = [
"Hello, how are you?" * 1000,
]


def run_prefill(prefill_done, prompts):
# We use GPU 0 for prefill node.
@@ -106,12 +110,7 @@ def run_lmcache_server(port):
return server_proc


if __name__ == "__main__":

prompts = [
"Hello, how are you?" * 1000,
]

def main():
prefill_done = Event()
prefill_process = Process(target=run_prefill, args=(prefill_done, prompts))
decode_process = Process(target=run_decode, args=(prefill_done, prompts))
@@ -128,3 +127,7 @@ def run_lmcache_server(port):
prefill_process.terminate()
lmcache_server_process.terminate()
lmcache_server_process.wait()


if __name__ == "__main__":
main()
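
Note: the `terminate()` followed by `wait()` at the end of this example mirrors standard `subprocess` cleanup. A minimal sketch under that assumption; the `http.server` child is a hypothetical stand-in for `run_lmcache_server(port)`, used only to make the sketch runnable:

import subprocess

def run_server(port):
    # Hypothetical stand-in for run_lmcache_server(port); any
    # long-running child process is handled the same way.
    return subprocess.Popen(["python3", "-m", "http.server", str(port)])

def main():
    server_proc = run_server(8100)
    try:
        pass  # drive the prefill/decode work here
    finally:
        server_proc.terminate()  # ask the child to exit
        server_proc.wait()       # reap it so no zombie is left behind

if __name__ == "__main__":
    main()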
62 changes: 38 additions & 24 deletions examples/online_serving/cohere_rerank_client.py
@@ -2,32 +2,46 @@
"""
Example of using the OpenAI entrypoint's rerank API which is compatible with
the Cohere SDK: https://github.com/cohere-ai/cohere-python
Note that `pip install cohere` is needed to run this example.

run: vllm serve BAAI/bge-reranker-base
"""
from typing import Union

import cohere
from cohere import Client, ClientV2

model = "BAAI/bge-reranker-base"

query = "What is the capital of France?"

documents = [
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
]


def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
documents: list[str]) -> dict:
return client.rerank(model=model, query=query, documents=documents)


def main():
# cohere v1 client
cohere_v1 = cohere.Client(base_url="http://localhost:8000",
api_key="sk-fake-key")
rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
print("-" * 50)
print("rerank_v1_result:\n", rerank_v1_result)
print("-" * 50)

# or the v2
cohere_v2 = cohere.ClientV2("sk-fake-key",
base_url="http://localhost:8000")
rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
print("rerank_v2_result:\n", rerank_v2_result)
print("-" * 50)


# cohere v1 client
co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
rerank_v1_result = co.rerank(
model="BAAI/bge-reranker-base",
query="What is the capital of France?",
documents=[
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
])

print(rerank_v1_result)

# or the v2
co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")

v2_rerank_result = co2.rerank(
model="BAAI/bge-reranker-base",
query="What is the capital of France?",
documents=[
"The capital of France is Paris", "Reranking is fun!",
"vLLM is an open-source framework for fast AI serving"
])

print(v2_rerank_result)
if __name__ == "__main__":
main()
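
Note: to try this example, the docstring's prerequisites apply: `pip install cohere`, then `vllm serve BAAI/bge-reranker-base`. A minimal sketch of reading the scores back out, assuming the Cohere SDK's usual response shape (a `results` list whose items carry `index` and `relevance_score`):

import cohere

def main():
    client = cohere.Client(base_url="http://localhost:8000",
                           api_key="sk-fake-key")
    result = client.rerank(
        model="BAAI/bge-reranker-base",
        query="What is the capital of France?",
        documents=["The capital of France is Paris", "Reranking is fun!"],
    )
    for item in result.results:
        # Each item points back at the original document by index
        # and carries the model's relevance score for it.
        print(item.index, item.relevance_score)

if __name__ == "__main__":
    main()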
25 changes: 16 additions & 9 deletions examples/online_serving/jinaai_rerank_client.py
@@ -23,12 +23,19 @@
"The capital of France is Paris.", "Horses and cows are both animals"
]
}
response = requests.post(url, headers=headers, json=data)

# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)


def main():
response = requests.post(url, headers=headers, json=data)

# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)


if __name__ == "__main__":
main()
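
Note: the manual status-code check in this example is equivalent to the helper built into `requests`. A minimal sketch using `raise_for_status()`, which raises `requests.HTTPError` on 4xx/5xx responses; the `post_rerank` wrapper is hypothetical:

import requests

def post_rerank(url: str, headers: dict, data: dict) -> dict:
    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()  # surfaces HTTP errors as exceptions
    return response.json()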
73 changes: 42 additions & 31 deletions examples/online_serving/openai_chat_completion_client.py
@@ -1,38 +1,49 @@
# SPDX-License-Identifier: Apache-2.0

"""Example Python client for OpenAI Chat Completion using vLLM API server
NOTE: start a supported chat completion model server with `vllm serve`, e.g.
vllm serve meta-llama/Llama-2-7b-chat-hf
"""
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

chat_completion = client.chat.completions.create(
messages=[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role":
"assistant",
"content":
"The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}],
model=model,
)

print("Chat completion results:")
print(chat_completion)
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Who won the world series in 2020?"
}, {
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020."
}, {
"role": "user",
"content": "Where was it played?"
}]


def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

chat_completion = client.chat.completions.create(
messages=messages,
model=model,
)

print("-" * 50)
print("Chat completion results:")
print(chat_completion)
print("-" * 50)


if __name__ == "__main__":
main()
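
Note: a minimal sketch of the streaming variant of the same call, for when incremental output is wanted. It assumes the standard OpenAI SDK behavior that `stream=True` yields chunks whose content lives in `choices[0].delta.content`, and reuses the server URL and dummy key from the example above:

from openai import OpenAI

def main():
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
    model = client.models.list().data[0].id
    stream = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": "Where was the 2020 World Series played?"
        }],
        model=model,
        stream=True,  # yield chunks as tokens are generated
    )
    for chunk in stream:
        # delta.content is None for some chunks (e.g. role-only deltas).
        print(chunk.choices[0].delta.content or "", end="")
    print()

if __name__ == "__main__":
    main()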