diff --git a/docs/source/features/prompt_embeds.md b/docs/source/features/prompt_embeds.md
index 4e4648d171d5..9d7b242bbe51 100644
--- a/docs/source/features/prompt_embeds.md
+++ b/docs/source/features/prompt_embeds.md
@@ -20,59 +20,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPrompt`:
 
 You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
 
-```python
-from vllm import LLM
-import transformers
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-llm = LLM(model=model_name, enable_prompt_embeds=True)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Single prompt inference
-outputs = llm.generate({
-    "prompt_embeds": prompt_embeds,
-})
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-
-# Batch inference
-
-chats = [
-    [{"role": "user", "content": "Please tell me about the capital of France."}],
-    [{"role": "user", "content": "When is the day longest during the year?"}],
-    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
-]
-
-token_ids_list = [
-    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
-]
-prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]
-
-outputs = llm.generate(
-    [
-        {
-            "prompt_embeds": prompt_embeds,
-        } for prompt_embeds in prompt_embeds_list
-    ]
-)
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```
+
 
 ## Online Serving
 
@@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
 
 Then, you can use the OpenAI client as follows:
 
-```python
-from openai import OpenAI
-import transformers
-import torch
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Prompt embeddings
-buffer = io.BytesIO()
-torch.save(prompt_embeds, buffer)
-buffer.seek(0)
-binary_data = buffer.read()
-encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
-
-
-completion = client_with_prompt_embeds.completions.create(
-    model=model_name,
-    # NOTE: The OpenAI client does not allow `None` as an input to
-    # `prompt`. Use an empty string if you have no text prompts.
-    prompt="",
-    max_tokens=5,
-    temperature=0.0,
-    # NOTE: The OpenAI client allows passing in extra JSON body via the
-    # `extra_body` argument.
-    extra_body={"prompt_embeds": encoded_embeds}
-)
-
-print(completion.choices[0].text)
-```
+
diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py
new file mode 100644
index 000000000000..99c5a682fb27
--- /dev/null
+++ b/examples/offline_inference/prompt_embed_inference.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Demonstrates how to generate prompt embeddings using
+Hugging Face Transformers and use them as input to vLLM
+for both single and batch inference.
+
+Model: meta-llama/Llama-3.2-1B-Instruct
+Note: This model is gated on Hugging Face Hub.
+      You must request access to use it:
+      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
+
+Requirements:
+- vLLM
+- transformers
+
+Run:
+    python examples/offline_inference/prompt_embed_inference.py
+"""
+
+import torch
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizer)
+
+from vllm import LLM
+
+
+def init_tokenizer_and_llm(model_name: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
+    embedding_layer = transformers_model.get_input_embeddings()
+    llm = LLM(model=model_name, enable_prompt_embeds=True)
+    return tokenizer, embedding_layer, llm
+
+
+def get_prompt_embeds(chat: list[dict[str, str]],
+                      tokenizer: PreTrainedTokenizer,
+                      embedding_layer: torch.nn.Module):
+    token_ids = tokenizer.apply_chat_template(chat,
+                                              add_generation_prompt=True,
+                                              return_tensors='pt')
+    prompt_embeds = embedding_layer(token_ids).squeeze(0)
+    return prompt_embeds
+
+
+def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
+                            embedding_layer: torch.nn.Module):
+    chat = [{
+        "role": "user",
+        "content": "Please tell me about the capital of France."
+    }]
+    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
+
+    outputs = llm.generate({
+        "prompt_embeds": prompt_embeds,
+    })
+
+    print("\n[Single Inference Output]")
+    print("-" * 30)
+    for o in outputs:
+        print(o.outputs[0].text)
+    print("-" * 30)
+
+
+def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
+                           embedding_layer: torch.nn.Module):
+    chats = [[{
+        "role": "user",
+        "content": "Please tell me about the capital of France."
+    }],
+             [{
+                 "role": "user",
+                 "content": "When is the day longest during the year?"
+             }],
+             [{
+                 "role": "user",
+                 "content": "Which is bigger, the moon or the sun?"
+             }]]
+
+    prompt_embeds_list = [
+        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
+    ]
+
+    outputs = llm.generate([{
+        "prompt_embeds": embeds
+    } for embeds in prompt_embeds_list])
+
+    print("\n[Batch Inference Outputs]")
+    print("-" * 30)
+    for i, o in enumerate(outputs):
+        print(f"Q{i+1}: {chats[i][0]['content']}")
+        print(f"A{i+1}: {o.outputs[0].text}\n")
+    print("-" * 30)
+
+
+def main():
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
+    single_prompt_inference(llm, tokenizer, embedding_layer)
+    batch_prompt_inference(llm, tokenizer, embedding_layer)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py
new file mode 100644
index 000000000000..ea580f1b432b
--- /dev/null
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+vLLM OpenAI-Compatible Client with Prompt Embeddings
+
+This script demonstrates how to:
+1. Generate prompt embeddings using Hugging Face Transformers
+2. Encode them in base64 format
+3. Send them to a vLLM server via the OpenAI-compatible Completions API
+
+Run the vLLM server first:
+vllm serve meta-llama/Llama-3.2-1B-Instruct \
+    --task generate \
+    --max-model-len 4096 \
+    --enable-prompt-embeds
+
+Run the client:
+python examples/online_serving/prompt_embed_inference_with_openai_client.py
+
+Model: meta-llama/Llama-3.2-1B-Instruct
+Note: This model is gated on Hugging Face Hub.
+      You must request access to use it:
+      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
+
+Dependencies:
+- transformers
+- torch
+- openai
+"""
+import base64
+import io
+
+import torch
+import transformers
+from openai import OpenAI
+
+
+def main():
+    client = OpenAI(
+        api_key="EMPTY",
+        base_url="http://localhost:8000/v1",
+    )
+
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+
+    # Transformers
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_name)
+
+    # Refer to the HuggingFace repo for the correct format to use
+    chat = [{
+        "role": "user",
+        "content": "Please tell me about the capital of France."
+    }]
+    token_ids = tokenizer.apply_chat_template(chat,
+                                              add_generation_prompt=True,
+                                              return_tensors='pt')
+
+    embedding_layer = transformers_model.get_input_embeddings()
+    prompt_embeds = embedding_layer(token_ids).squeeze(0)
+
+    # Prompt embeddings
+    buffer = io.BytesIO()
+    torch.save(prompt_embeds, buffer)
+    buffer.seek(0)
+    binary_data = buffer.read()
+    encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
+
+    completion = client.completions.create(
+        model=model_name,
+        # NOTE: The OpenAI client does not allow `None` as an input to
+        # `prompt`. Use an empty string if you have no text prompts.
+        prompt="",
+        max_tokens=5,
+        temperature=0.0,
+        # NOTE: The OpenAI client allows passing in extra JSON body via the
+        # `extra_body` argument.
+        extra_body={"prompt_embeds": encoded_embeds})
+
+    print("-" * 30)
+    print(completion.choices[0].text)
+    print("-" * 30)
+
+
+if __name__ == "__main__":
+    main()