From 1b91750ffb50ee789dc4b510aa8e5bfa7c765fcf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 13:10:21 +0000 Subject: [PATCH 01/43] Initial implementation --- tests/entrypoints/openai/test_serving_chat.py | 36 ++-- vllm/entrypoints/openai/api_server.py | 7 +- vllm/entrypoints/openai/protocol.py | 67 +++++- vllm/entrypoints/openai/run_batch.py | 5 +- .../openai/serving_chat/__init__.py | 0 .../completions.py} | 18 +- .../openai/serving_chat/embedding.py | 194 ++++++++++++++++++ vllm/entrypoints/openai/serving_completion.py | 8 +- vllm/entrypoints/openai/serving_embedding.py | 18 +- vllm/entrypoints/openai/serving_engine.py | 3 +- 10 files changed, 312 insertions(+), 44 deletions(-) create mode 100644 vllm/entrypoints/openai/serving_chat/__init__.py rename vllm/entrypoints/openai/{serving_chat.py => serving_chat/completions.py} (98%) create mode 100644 vllm/entrypoints/openai/serving_chat/embedding.py diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index e969d33775d86..f74ade0e37892 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -6,7 +6,8 @@ from vllm.config import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_chat.completions import ( + OpenAIServingChatCompletions) from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.transformers_utils.tokenizer import get_tokenizer @@ -44,14 +45,15 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() - serving_completion = OpenAIServingChat(engine, - model_config, - BASE_MODEL_PATHS, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - lora_modules=None, - prompt_adapters=None, - request_logger=None) + serving_completion = OpenAIServingChatCompletions( + engine, + model_config, + BASE_MODEL_PATHS, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + lora_modules=None, + prompt_adapters=None, + request_logger=None) return serving_completion @@ -65,14 +67,14 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - serving_chat = OpenAIServingChat(mock_engine, - MockModelConfig(), - BASE_MODEL_PATHS, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - lora_modules=None, - prompt_adapters=None, - request_logger=None) + serving_chat = OpenAIServingChatCompletions(mock_engine, + MockModelConfig(), + BASE_MODEL_PATHS, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + lora_modules=None, + prompt_adapters=None, + request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, messages=[{ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ae44b26a6c55a..f8e21df49f7f7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -48,7 +48,8 @@ TokenizeResponse, UnloadLoraAdapterRequest) # yapf: enable -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_chat.completions import ( + OpenAIServingChatCompletions) from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from 
vllm.entrypoints.openai.serving_engine import BaseModelPath @@ -245,7 +246,7 @@ def mount_metrics(app: FastAPI): app.routes.append(metrics_route) -def chat(request: Request) -> OpenAIServingChat: +def chat(request: Request) -> OpenAIServingChatCompletions: return request.app.state.openai_serving_chat @@ -487,7 +488,7 @@ def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats - state.openai_serving_chat = OpenAIServingChat( + state.openai_serving_chat = OpenAIServingChatCompletions( engine_client, model_config, base_model_paths, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a212c0d608ddb..89bb8a85678c3 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -701,6 +701,71 @@ def validate_stream_options(cls, data): return data +class ChatEmbeddingRequest(OpenAIBaseModel): + messages: List[ChatCompletionMessageParam] + model: str + encoding_format: Literal["float", "base64"] = "float" + dimensions: Optional[int] = None + user: Optional[str] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-chat-embedding-pooling-params + additional_data: Optional[Any] = None + + # doc: end-chat-embedding-pooling-params + + # doc: begin-chat-embedding-extra-params + add_generation_prompt: bool = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + continue_final_message: bool = Field( + default=False, + description= + ("If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + "This allows you to \"prefill\" part of the model's response for it. " + "Cannot be used at the same time as `add_generation_prompt`."), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)."), + ) + chat_template: Optional[str] = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), + ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the template renderer. " + "Will be accessible by the chat template."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). 
Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + # doc: end-chat-embedding-extra-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + class EmbeddingRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings @@ -792,7 +857,7 @@ class EmbeddingResponseData(OpenAIBaseModel): class EmbeddingResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") + id: str = Field(default_factory=lambda: f"embd-{random_uuid()}") object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f5249a0c447b3..80169cc8609c0 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -18,7 +18,8 @@ ChatCompletionResponse, EmbeddingResponse, ErrorResponse) # yapf: enable -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_chat.completions import ( + OpenAIServingChatCompletions) from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.usage.usage_lib import UsageContext @@ -208,7 +209,7 @@ async def main(args): request_logger = RequestLogger(max_log_len=args.max_log_len) # Create the openai serving objects. - openai_serving_chat = OpenAIServingChat( + openai_serving_chat = OpenAIServingChatCompletions( engine, model_config, base_model_paths, diff --git a/vllm/entrypoints/openai/serving_chat/__init__.py b/vllm/entrypoints/openai/serving_chat/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat/completions.py similarity index 98% rename from vllm/entrypoints/openai/serving_chat.py rename to vllm/entrypoints/openai/serving_chat/completions.py index cd2883a3b323b..9774481808327 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat/completions.py @@ -38,12 +38,12 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import iterate_with_cancellation +from vllm.utils import iterate_with_cancellation, is_list_of logger = init_logger(__name__) -class OpenAIServingChat(OpenAIServing): +class OpenAIServingChatCompletions(OpenAIServing): def __init__(self, engine_client: EngineClient, @@ -94,12 +94,12 @@ async def create_chat_completion( raw_request: Optional[Request] = None, ) -> Union[AsyncGenerator[str, None], ChatCompletionResponse, ErrorResponse]: - """Completion API similar to OpenAI's API. + """ + Chat Completion API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/chat/create for the API specification. This API mimics the OpenAI - ChatCompletion API. - + Chat Completion API. 
""" error_check_ret = await self._check_model(request) if error_check_ret is not None: @@ -176,7 +176,7 @@ async def create_chat_completion( "\"auto\" tool choice requires " "--enable-auto-tool-choice and --tool-call-parser to be set") - request_id = f"chat-{request.request_id}" + request_id = f"chatcmpl-{request.request_id}" request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: @@ -196,9 +196,9 @@ async def create_chat_completion( add_special_tokens=request.add_special_tokens, ) else: - assert isinstance(prompt, list) and isinstance( - prompt[0], int - ), "Prompt has to be either a string or a list of token ids" + # For MistralTokenizer + assert is_list_of(prompt, int), ( + "Prompt has to be either a string or a list of token ids") prompt_inputs = TextTokensPrompt( prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) diff --git a/vllm/entrypoints/openai/serving_chat/embedding.py b/vllm/entrypoints/openai/serving_chat/embedding.py new file mode 100644 index 0000000000000..d088bafd46715 --- /dev/null +++ b/vllm/entrypoints/openai/serving_chat/embedding.py @@ -0,0 +1,194 @@ +import asyncio +import time +from typing import List, Optional, Union + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import (apply_hf_chat_template, + apply_mistral_chat_template, + load_chat_template, + parse_chat_messages_futures) +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ChatEmbeddingRequest, + EmbeddingResponse, ErrorResponse) +from vllm.entrypoints.openai.serving_embedding import ( + check_embedding_mode, request_output_to_embedding_response) +from vllm.entrypoints.openai.serving_engine import (BaseModelPath, OpenAIServing, + TextTokensPrompt) +from vllm.inputs import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import EmbeddingRequestOutput +from vllm.transformers_utils.tokenizer import MistralTokenizer +from vllm.utils import iterate_with_cancellation, is_list_of, random_uuid + +logger = init_logger(__name__) + + +class OpenAIServingChatEmbedding(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + *, + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + ): + super().__init__(engine_client=engine_client, + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=None, + prompt_adapters=None, + request_logger=request_logger) + + self.chat_template = load_chat_template(chat_template) + + self._enabled = check_embedding_mode(model_config) + + async def create_embedding( + self, + request: ChatEmbeddingRequest, + raw_request: Optional[Request] = None, + ) -> Union[EmbeddingResponse, ErrorResponse]: + """ + Chat Embedding API, a variant of Embedding API that accepts chat conversations + which can include multi-modal data. 
+ """ + if not self._enabled: + return self.create_error_response("Embedding API disabled") + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + encoding_format = request.encoding_format + if request.dimensions is not None: + return self.create_error_response( + "dimensions is currently not supported") + + model_name = request.model + request_id = f"chatembd-{random_uuid()}" + created_time = int(time.monotonic()) + + truncate_prompt_tokens = None + + if request.truncate_prompt_tokens is not None: + if request.truncate_prompt_tokens <= self.max_model_len: + truncate_prompt_tokens = request.truncate_prompt_tokens + else: + return self.create_error_response( + "truncate_prompt_tokens value is " + "greater than max_model_len." + " Please, select a smaller truncation size.") + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + model_config = self.model_config + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + conversation, mm_data_future = parse_chat_messages_futures( + request.messages, model_config, tokenizer) + + prompt: Union[str, List[int]] + is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) + if is_mistral_tokenizer: + prompt = apply_mistral_chat_template( + tokenizer, + messages=request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + **(request.chat_template_kwargs or {}), + ) + else: + prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + **(request.chat_template_kwargs or {}), + ) + except Exception as e: + logger.exception("Error in applying chat template from request") + return self.create_error_response(str(e)) + + try: + mm_data = await mm_data_future + except Exception as e: + logger.exception("Error in loading multi-modal data") + return self.create_error_response(str(e)) + + try: + pooling_params = request.to_pooling_params() + + if isinstance(prompt, str): + prompt_inputs = self._tokenize_prompt_input( + request, + tokenizer, + prompt, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + # For MistralTokenizer + assert is_list_of(prompt, int), ( + "Prompt has to be either a string or a list of token ids") + prompt_inputs = TextTokensPrompt( + prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) + + assert prompt_inputs is not None + + self._log_inputs(request_id, + prompt_inputs, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + engine_inputs = TokensPrompt( + prompt_token_ids=prompt_inputs["prompt_token_ids"]) + if mm_data is not None: + engine_inputs["multi_modal_data"] = mm_data + + result_generator = self.engine_client.encode( + engine_inputs, + pooling_params, + request_id, + lora_request=lora_request, + priority=request.priority, + ) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + if raw_request: + result_generator = iterate_with_cancellation( + result_generator, raw_request.is_disconnected) + + # Non-streaming response + final_res: Optional[EmbeddingRequestOutput] = None + + try: + async for res in result_generator: + final_res = res + except asyncio.CancelledError: + 
return self.create_error_response("Client disconnected") + + assert final_res is not None + + try: + response = request_output_to_embedding_response( + [final_res], request_id, created_time, model_name, + encoding_format) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 56e35950410a0..7f5a0b2ea51c4 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,6 @@ import asyncio import time -from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List, - Optional) +from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional from typing import Sequence as GenericSequence from typing import Tuple, Union, cast @@ -37,11 +36,6 @@ logger = init_logger(__name__) -TypeTokenIDs = List[int] -TypeTopLogProbs = List[Optional[Dict[int, float]]] -TypeCreateLogProbsFn = Callable[ - [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs] - class OpenAIServingCompletion(OpenAIServing): diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 6c46aae2838f6..a867f803ddd7a 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -21,7 +21,16 @@ logger = init_logger(__name__) -TypeTokenIDs = List[int] + +def check_embedding_mode(model_config: ModelConfig) -> bool: + embedding_mode = model_config.task == "embedding" + + if not embedding_mode: + logger.warning("embedding_mode is False. Embedding API will not work.") + else: + logger.info("Activating the server engine with embedding enabled.") + + return embedding_mode def _get_embedding( @@ -83,15 +92,16 @@ def __init__( lora_modules=None, prompt_adapters=None, request_logger=request_logger) - self._enabled = self._check_embedding_mode( - model_config.task == "embedding") + + self._enabled = check_embedding_mode(model_config) async def create_embedding( self, request: EmbeddingRequest, raw_request: Optional[Request] = None, ) -> Union[EmbeddingResponse, ErrorResponse]: - """Completion API similar to OpenAI's API. + """ + Embedding API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. 
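The `create_embedding` docstring above describes a chat-style Embedding API that accepts conversations, including multi-modal content. Below is a minimal client-side sketch of such a call, assuming the request is exposed on the `/v1/embeddings` route; the host, port, route, and model name are placeholders, while the JSON fields mirror `ChatEmbeddingRequest` (renamed to `EmbeddingChatRequest` and folded into `OpenAIServingEmbedding` later in this series):

```python
# Illustrative sketch only: the endpoint path, host/port, and model name are
# assumptions; the request fields follow ChatEmbeddingRequest from this patch.
import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",  # assumed route for chat-style embedding requests
    json={
        "model": "intfloat/e5-mistral-7b-instruct",  # placeholder embedding-task model
        "messages": [
            {"role": "user", "content": "Represent this sentence: vLLM serves embeddings."},
        ],
        "encoding_format": "float",
        "add_generation_prompt": True,
    },
)
resp.raise_for_status()
# EmbeddingResponse: {"object": "list", "data": [{"embedding": [...], ...}], ...}
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))
```

Because the request carries `messages` rather than `input`, image or other multi-modal parts can be passed in the same content format used by the Chat Completions API.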
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e6d2ab93d3363..c90f335f75e71 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -13,6 +13,7 @@ # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + ChatEmbeddingRequest, CompletionRequest, DetokenizeRequest, EmbeddingRequest, ErrorResponse, @@ -57,7 +58,7 @@ class LoRAModulePath: AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, TokenizeRequest] + ChatEmbeddingRequest, EmbeddingRequest, TokenizeRequest] class TextTokensPrompt(TypedDict): From 61e0fcf27c255dd7af770485bad94cb758a72404 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 13:10:27 +0000 Subject: [PATCH 02/43] Update docs --- docs/source/getting_started/quickstart.rst | 6 +++--- docs/source/serving/openai_compatible_server.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index f0e6cddf09ef7..55b929b4f6fd9 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -138,10 +138,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep A more detailed client example can be found `here `__. -OpenAI Chat API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ +OpenAI Chat Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. You can use the `create chat completion `_ endpoint to interact with the model: diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 413c87ab28755..3b8708a13086c 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -49,7 +49,7 @@ completion = client.chat.completions.create( ) ``` -### Extra Parameters for Chat API +### Extra Parameters for Chat Completions API The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py From c62be47b1d2d2f1e392bdc746fc91f356b7c45a0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 14:04:50 +0000 Subject: [PATCH 03/43] Cleanup --- tests/entrypoints/openai/test_basic.py | 14 +++----- tests/entrypoints/openai/test_metrics.py | 13 +++----- tests/entrypoints/openai/test_tokenization.py | 32 +++++++++++-------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index d3aea533b6db9..63aa76e67977f 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,7 +1,6 @@ from http import HTTPStatus from typing import List -import openai import pytest import pytest_asyncio import requests @@ -69,6 +68,7 @@ def server(server_args): @pytest_asyncio.fixture async def client(server): + server.url_for() async with server.get_async_client() as async_client: yield async_client @@ -83,10 +83,8 @@ async def client(server): indirect=True, ) @pytest.mark.asyncio -async def test_show_version(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/version") +async def test_show_version(server: RemoteOpenAIServer): + response = requests.get(server.url_for("version")) response.raise_for_status() assert response.json() == {"version": VLLM_VERSION} @@ -102,9 +100,7 @@ async def test_show_version(client: openai.AsyncOpenAI): indirect=True, ) @pytest.mark.asyncio -async def test_check_health(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/health") +async def test_check_health(server: RemoteOpenAIServer): + response = requests.get(server.url_for("health")) assert response.status_code == HTTPStatus.OK diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6cb74eb78cbf0..6f3808b6db142 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -4,7 +4,6 @@ import time from http import HTTPStatus -import openai import pytest import pytest_asyncio import requests @@ -79,9 +78,7 @@ async def client(server): @pytest.mark.asyncio -async def test_metrics_counts(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_counts(server: RemoteOpenAIServer): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -89,7 +86,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): prompt=_TOKENIZED_PROMPT, max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) print(response.text) assert response.status_code == HTTPStatus.OK @@ -170,16 +167,14 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_metrics_exist(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_exist(server: RemoteOpenAIServer): # sending a request triggers the metrics to be logged. 
await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK for metric in EXPECTED_METRICS: diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 859a676a9c777..b1956a8cbc9dc 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,3 @@ -import openai # use the official client for correctness check import pytest import pytest_asyncio import requests @@ -55,9 +54,11 @@ async def client(server): [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_completions(client: openai.AsyncOpenAI, - model_name: str, tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_completions( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, prompt = "vllm1 This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_special_tokens": add_special, "model": model_name, @@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_chat( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_generation_prompt": add_generation, @@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_detokenize( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a test prompt. 
vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) - print(f"CALLING {base_url} FOR {model_name}") - response = requests.post(base_url + "/detokenize", + response = requests.post(server.url_for("detokenize"), json={ "model": model_name, "tokens": tokens From cc999b1963b9dfbb87d2540279265764d0b4422c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 14:28:48 +0000 Subject: [PATCH 04/43] Consolidate and make code consistent --- tests/entrypoints/openai/test_serving_chat.py | 3 +- vllm/entrypoints/openai/api_server.py | 3 +- vllm/entrypoints/openai/protocol.py | 79 ++++--- vllm/entrypoints/openai/run_batch.py | 3 +- .../completions.py => serving_chat.py} | 8 +- .../openai/serving_chat/__init__.py | 0 .../openai/serving_chat/embedding.py | 194 ------------------ vllm/entrypoints/openai/serving_embedding.py | 87 +++++++- vllm/entrypoints/openai/serving_engine.py | 3 +- .../openai/serving_tokenization.py | 10 +- 10 files changed, 137 insertions(+), 253 deletions(-) rename vllm/entrypoints/openai/{serving_chat/completions.py => serving_chat.py} (99%) delete mode 100644 vllm/entrypoints/openai/serving_chat/__init__.py delete mode 100644 vllm/entrypoints/openai/serving_chat/embedding.py diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index f74ade0e37892..50887011f69c5 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -6,8 +6,7 @@ from vllm.config import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.entrypoints.openai.serving_chat.completions import ( - OpenAIServingChatCompletions) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChatCompletions from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f8e21df49f7f7..9b41725fe4e93 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -48,8 +48,7 @@ TokenizeResponse, UnloadLoraAdapterRequest) # yapf: enable -from vllm.entrypoints.openai.serving_chat.completions import ( - OpenAIServingChatCompletions) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChatCompletions from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 89bb8a85678c3..85f6b39a98f07 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -701,18 +701,52 @@ def validate_stream_options(cls, data): return data -class ChatEmbeddingRequest(OpenAIBaseModel): - messages: List[ChatCompletionMessageParam] +class EmbeddingCompletionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/embeddings model: str + input: Union[List[int], List[List[int]], str, List[str]] encoding_format: Literal["float", "base64"] = "float" dimensions: Optional[int] = None user: Optional[str] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - # doc: begin-chat-embedding-pooling-params + # doc: begin-embedding-pooling-params 
additional_data: Optional[Any] = None - # doc: end-chat-embedding-pooling-params + # doc: end-embedding-pooling-params + + # doc: begin-embedding-extra-params + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + # doc: end-embedding-extra-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +class EmbeddingChatRequest(OpenAIBaseModel): + model: str + messages: List[ChatCompletionMessageParam] + + encoding_format: Literal["float", "base64"] = "float" + dimensions: Optional[int] = None + user: Optional[str] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-chat-embedding-pooling-params + additional_data: Optional[Any] = None # doc: begin-chat-embedding-extra-params add_generation_prompt: bool = Field( @@ -762,37 +796,20 @@ class ChatEmbeddingRequest(OpenAIBaseModel): # doc: end-chat-embedding-extra-params + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get( + "add_generation_prompt"): + raise ValueError("Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True.") + return data + def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) -class EmbeddingRequest(OpenAIBaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/embeddings - model: str - input: Union[List[int], List[List[int]], str, List[str]] - encoding_format: Literal["float", "base64"] = "float" - dimensions: Optional[int] = None - user: Optional[str] = None - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - - # doc: begin-embedding-pooling-params - additional_data: Optional[Any] = None - - # doc: end-embedding-pooling-params - - # doc: begin-embedding-extra-params - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0). 
Any priority other than 0 will raise an error " - "if the served model does not use priority scheduling.")) - - # doc: end-embedding-extra-params - - def to_pooling_params(self): - return PoolingParams(additional_data=self.additional_data) +EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] class CompletionLogProbs(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 80169cc8609c0..2d9ec6f32994f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -18,8 +18,7 @@ ChatCompletionResponse, EmbeddingResponse, ErrorResponse) # yapf: enable -from vllm.entrypoints.openai.serving_chat.completions import ( - OpenAIServingChatCompletions) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChatCompletions from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.usage.usage_lib import UsageContext diff --git a/vllm/entrypoints/openai/serving_chat/completions.py b/vllm/entrypoints/openai/serving_chat.py similarity index 99% rename from vllm/entrypoints/openai/serving_chat/completions.py rename to vllm/entrypoints/openai/serving_chat.py index 9774481808327..0b61c33a9d96b 100644 --- a/vllm/entrypoints/openai/serving_chat/completions.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -152,14 +152,10 @@ async def create_chat_completion( documents=request.documents, **(request.chat_template_kwargs or {}), ) - except Exception as e: - logger.exception("Error in applying chat template from request") - return self.create_error_response(str(e)) - - try: + mm_data = await mm_data_future except Exception as e: - logger.exception("Error in loading multi-modal data") + logger.exception("Error in applying chat template from request") return self.create_error_response(str(e)) # validation for OpenAI tools diff --git a/vllm/entrypoints/openai/serving_chat/__init__.py b/vllm/entrypoints/openai/serving_chat/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/entrypoints/openai/serving_chat/embedding.py b/vllm/entrypoints/openai/serving_chat/embedding.py deleted file mode 100644 index d088bafd46715..0000000000000 --- a/vllm/entrypoints/openai/serving_chat/embedding.py +++ /dev/null @@ -1,194 +0,0 @@ -import asyncio -import time -from typing import List, Optional, Union - -from fastapi import Request - -from vllm.config import ModelConfig -from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) -from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import (ChatEmbeddingRequest, - EmbeddingResponse, ErrorResponse) -from vllm.entrypoints.openai.serving_embedding import ( - check_embedding_mode, request_output_to_embedding_response) -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, OpenAIServing, - TextTokensPrompt) -from vllm.inputs import TokensPrompt -from vllm.logger import init_logger -from vllm.outputs import EmbeddingRequestOutput -from vllm.transformers_utils.tokenizer import MistralTokenizer -from vllm.utils import iterate_with_cancellation, is_list_of, random_uuid - -logger = init_logger(__name__) - - -class OpenAIServingChatEmbedding(OpenAIServing): - - def __init__( - self, - engine_client: EngineClient, - model_config: ModelConfig, - base_model_paths: 
List[BaseModelPath], - *, - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - ): - super().__init__(engine_client=engine_client, - model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, - request_logger=request_logger) - - self.chat_template = load_chat_template(chat_template) - - self._enabled = check_embedding_mode(model_config) - - async def create_embedding( - self, - request: ChatEmbeddingRequest, - raw_request: Optional[Request] = None, - ) -> Union[EmbeddingResponse, ErrorResponse]: - """ - Chat Embedding API, a variant of Embedding API that accepts chat conversations - which can include multi-modal data. - """ - if not self._enabled: - return self.create_error_response("Embedding API disabled") - error_check_ret = await self._check_model(request) - if error_check_ret is not None: - return error_check_ret - - encoding_format = request.encoding_format - if request.dimensions is not None: - return self.create_error_response( - "dimensions is currently not supported") - - model_name = request.model - request_id = f"chatembd-{random_uuid()}" - created_time = int(time.monotonic()) - - truncate_prompt_tokens = None - - if request.truncate_prompt_tokens is not None: - if request.truncate_prompt_tokens <= self.max_model_len: - truncate_prompt_tokens = request.truncate_prompt_tokens - else: - return self.create_error_response( - "truncate_prompt_tokens value is " - "greater than max_model_len." - " Please, select a smaller truncation size.") - - try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - - model_config = self.model_config - tokenizer = await self.engine_client.get_tokenizer(lora_request) - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) - - prompt: Union[str, List[int]] - is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) - if is_mistral_tokenizer: - prompt = apply_mistral_chat_template( - tokenizer, - messages=request.messages, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - **(request.chat_template_kwargs or {}), - ) - else: - prompt = apply_hf_chat_template( - tokenizer, - conversation=conversation, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - **(request.chat_template_kwargs or {}), - ) - except Exception as e: - logger.exception("Error in applying chat template from request") - return self.create_error_response(str(e)) - - try: - mm_data = await mm_data_future - except Exception as e: - logger.exception("Error in loading multi-modal data") - return self.create_error_response(str(e)) - - try: - pooling_params = request.to_pooling_params() - - if isinstance(prompt, str): - prompt_inputs = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) - else: - # For MistralTokenizer - assert is_list_of(prompt, int), ( - "Prompt has to be either a string or a list of token ids") - prompt_inputs = TextTokensPrompt( - prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) - - assert prompt_inputs is not None - - self._log_inputs(request_id, - prompt_inputs, - params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) - - engine_inputs = TokensPrompt( - 
prompt_token_ids=prompt_inputs["prompt_token_ids"]) - if mm_data is not None: - engine_inputs["multi_modal_data"] = mm_data - - result_generator = self.engine_client.encode( - engine_inputs, - pooling_params, - request_id, - lora_request=lora_request, - priority=request.priority, - ) - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - - if raw_request: - result_generator = iterate_with_cancellation( - result_generator, raw_request.is_disconnected) - - # Non-streaming response - final_res: Optional[EmbeddingRequestOutput] = None - - try: - async for res in result_generator: - final_res = res - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - - assert final_res is not None - - try: - response = request_output_to_embedding_response( - [final_res], request_id, created_time, model_name, - encoding_format) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - - return response diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index a867f803ddd7a..296a583afa30a 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -9,15 +9,24 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import (apply_hf_chat_template, + apply_mistral_chat_template, + load_chat_template, + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import (EmbeddingRequest, +from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, + EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import (BaseModelPath, + OpenAIServing, + TextTokensPrompt) +from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.outputs import EmbeddingOutput, EmbeddingRequestOutput -from vllm.utils import merge_async_iterators, random_uuid +from vllm.transformers_utils.tokenizer import MistralTokenizer +from vllm.utils import is_list_of, merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -85,6 +94,7 @@ def __init__( base_model_paths: List[BaseModelPath], *, request_logger: Optional[RequestLogger], + chat_template: Optional[str], ): super().__init__(engine_client=engine_client, model_config=model_config, @@ -93,6 +103,13 @@ def __init__( prompt_adapters=None, request_logger=request_logger) + # If this is None we use the tokenizer's default chat template + # the list of commonly-used chat template names for HF named templates + hf_chat_templates: List[str] = ['default', 'tool_use'] + self.chat_template = chat_template \ + if chat_template in hf_chat_templates \ + else load_chat_template(chat_template) + self._enabled = check_embedding_mode(model_config) async def create_embedding( @@ -142,12 +159,64 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) - pooling_params = request.to_pooling_params() + prompt: Union[str, List[int]] + if isinstance(request, EmbeddingChatRequest): + model_config = self.model_config + + conversation, mm_data_future = parse_chat_messages_futures( + request.messages, model_config, 
tokenizer) + + if isinstance(tokenizer, MistralTokenizer): + prompt = apply_mistral_chat_template( + tokenizer, + messages=request.messages, + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + ) + else: + prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + ) + + prompts = [prompt] + + mm_data = await mm_data_future + else: + prompts = list( + self._tokenize_prompt_input_or_inputs( + request, tokenizer, request.input, + truncate_prompt_tokens)) + + if isinstance(prompt, str): + prompt_inputs = self._tokenize_prompt_input( + request, + tokenizer, + prompt, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + # For MistralTokenizer + assert is_list_of(prompt, int), ( + "Prompt has to be either a string or a list of token ids") + prompt_inputs = TextTokensPrompt( + prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) + + engine_inputs = TokensPrompt( + prompt_token_ids=prompt_inputs["prompt_token_ids"]) + if mm_data is not None: + engine_inputs["multi_modal_data"] = mm_data + except ValueError as e: + logger.exception("Error in applying extracting prompt inputs") + return self.create_error_response(str(e)) - prompts = list( - self._tokenize_prompt_input_or_inputs(request, tokenizer, - request.input, - truncate_prompt_tokens)) + try: + pooling_params = request.to_pooling_params() for i, prompt_inputs in enumerate(prompts): request_id_item = f"{request_id}-{i}" @@ -164,7 +233,7 @@ async def create_embedding( "for embedding models") generator = self.engine_client.encode( - {"prompt_token_ids": prompt_inputs["prompt_token_ids"]}, + engine_inputs, pooling_params, request_id_item, lora_request=lora_request, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c90f335f75e71..e6d2ab93d3363 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -13,7 +13,6 @@ # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - ChatEmbeddingRequest, CompletionRequest, DetokenizeRequest, EmbeddingRequest, ErrorResponse, @@ -58,7 +57,7 @@ class LoRAModulePath: AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - ChatEmbeddingRequest, EmbeddingRequest, TokenizeRequest] + EmbeddingRequest, TokenizeRequest] class TextTokensPrompt(TypedDict): diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index a269c94c7ec0d..022715cda2316 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -76,11 +76,6 @@ async def create_tokenize( conversation, mm_data_future = parse_chat_messages_futures( request.messages, model_config, tokenizer) - mm_data = await mm_data_future - if mm_data: - logger.warning( - "Multi-modal inputs are ignored during tokenization") - if isinstance(tokenizer, MistralTokenizer): prompt = apply_mistral_chat_template( tokenizer, @@ -97,6 +92,11 @@ async def create_tokenize( add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, ) + + mm_data = await mm_data_future + if mm_data: + logger.warning( + "Multi-modal 
inputs are ignored during tokenization") else: prompt = request.prompt From 9ed87c19e0badbde746be9cb9b360a3802e8726b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 14:29:35 +0000 Subject: [PATCH 05/43] Remove useless statement --- tests/entrypoints/openai/test_basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index 63aa76e67977f..4616f363cc04a 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -68,7 +68,6 @@ def server(server_args): @pytest_asyncio.fixture async def client(server): - server.url_for() async with server.get_async_client() as async_client: yield async_client From efa7c6f53d901994ef518a345e9bed0c61c677ed Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 14:30:32 +0000 Subject: [PATCH 06/43] Rename back --- tests/entrypoints/openai/test_serving_chat.py | 35 +++++++++---------- vllm/entrypoints/openai/api_server.py | 6 ++-- vllm/entrypoints/openai/run_batch.py | 4 +-- vllm/entrypoints/openai/serving_chat.py | 6 ++-- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 50887011f69c5..e969d33775d86 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -6,7 +6,7 @@ from vllm.config import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.entrypoints.openai.serving_chat import OpenAIServingChatCompletions +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.transformers_utils.tokenizer import get_tokenizer @@ -44,15 +44,14 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() - serving_completion = OpenAIServingChatCompletions( - engine, - model_config, - BASE_MODEL_PATHS, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - lora_modules=None, - prompt_adapters=None, - request_logger=None) + serving_completion = OpenAIServingChat(engine, + model_config, + BASE_MODEL_PATHS, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + lora_modules=None, + prompt_adapters=None, + request_logger=None) return serving_completion @@ -66,14 +65,14 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False - serving_chat = OpenAIServingChatCompletions(mock_engine, - MockModelConfig(), - BASE_MODEL_PATHS, - response_role="assistant", - chat_template=CHAT_TEMPLATE, - lora_modules=None, - prompt_adapters=None, - request_logger=None) + serving_chat = OpenAIServingChat(mock_engine, + MockModelConfig(), + BASE_MODEL_PATHS, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + lora_modules=None, + prompt_adapters=None, + request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, messages=[{ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9b41725fe4e93..ae44b26a6c55a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -48,7 +48,7 @@ TokenizeResponse, UnloadLoraAdapterRequest) # yapf: enable -from vllm.entrypoints.openai.serving_chat import OpenAIServingChatCompletions +from 
vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath @@ -245,7 +245,7 @@ def mount_metrics(app: FastAPI): app.routes.append(metrics_route) -def chat(request: Request) -> OpenAIServingChatCompletions: +def chat(request: Request) -> OpenAIServingChat: return request.app.state.openai_serving_chat @@ -487,7 +487,7 @@ def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats - state.openai_serving_chat = OpenAIServingChatCompletions( + state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, base_model_paths, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 2d9ec6f32994f..f5249a0c447b3 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -18,7 +18,7 @@ ChatCompletionResponse, EmbeddingResponse, ErrorResponse) # yapf: enable -from vllm.entrypoints.openai.serving_chat import OpenAIServingChatCompletions +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.usage.usage_lib import UsageContext @@ -208,7 +208,7 @@ async def main(args): request_logger = RequestLogger(max_log_len=args.max_log_len) # Create the openai serving objects. - openai_serving_chat = OpenAIServingChatCompletions( + openai_serving_chat = OpenAIServingChat( engine, model_config, base_model_paths, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 0b61c33a9d96b..05c34980af350 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -38,12 +38,12 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import iterate_with_cancellation, is_list_of +from vllm.utils import is_list_of, iterate_with_cancellation logger = init_logger(__name__) -class OpenAIServingChatCompletions(OpenAIServing): +class OpenAIServingChat(OpenAIServing): def __init__(self, engine_client: EngineClient, @@ -152,7 +152,7 @@ async def create_chat_completion( documents=request.documents, **(request.chat_template_kwargs or {}), ) - + mm_data = await mm_data_future except Exception as e: logger.exception("Error in applying chat template from request") From ab9297eb9ea511990fd2b3514b513dfe4defd5b7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 28 Oct 2024 15:05:50 +0000 Subject: [PATCH 07/43] Factor out common code --- vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/run_batch.py | 1 + vllm/entrypoints/openai/serving_embedding.py | 87 +++--------- vllm/entrypoints/openai/serving_engine.py | 138 +++++++++++++++++-- 4 files changed, 151 insertions(+), 76 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ae44b26a6c55a..1229435cf782b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -513,6 +513,7 @@ def init_app_state( model_config, base_model_paths, request_logger=request_logger, + chat_template=args.chat_template, ) state.openai_serving_tokenization = 
OpenAIServingTokenization( engine_client, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f5249a0c447b3..41b9d92f1166d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -223,6 +223,7 @@ async def main(args): model_config, base_model_paths, request_logger=request_logger, + chat_template=None, ) tracker = BatchProgressTracker() diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 296a583afa30a..2a79e8775d528 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -9,24 +9,17 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - OpenAIServing, - TextTokensPrompt) -from vllm.inputs import TokensPrompt +from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.logger import init_logger from vllm.outputs import EmbeddingOutput, EmbeddingRequestOutput -from vllm.transformers_utils.tokenizer import MistralTokenizer -from vllm.utils import is_list_of, merge_async_iterators, random_uuid +from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -140,15 +133,6 @@ async def create_embedding( truncate_prompt_tokens = None - if request.truncate_prompt_tokens is not None: - if request.truncate_prompt_tokens <= self.max_model_len: - truncate_prompt_tokens = request.truncate_prompt_tokens - else: - return self.create_error_response( - "truncate_prompt_tokens value is " - "greater than max_model_len." - " Please, select a smaller truncation size.") - # Schedule the request and get the result generator. 
generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] try: @@ -159,58 +143,25 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) - prompt: Union[str, List[int]] if isinstance(request, EmbeddingChatRequest): - model_config = self.model_config - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) - - if isinstance(tokenizer, MistralTokenizer): - prompt = apply_mistral_chat_template( - tokenizer, - messages=request.messages, - chat_template=self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - ) - else: - prompt = apply_hf_chat_template( - tokenizer, - conversation=conversation, - chat_template=self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - ) - - prompts = [prompt] - - mm_data = await mm_data_future - else: - prompts = list( - self._tokenize_prompt_input_or_inputs( - request, tokenizer, request.input, - truncate_prompt_tokens)) - - if isinstance(prompt, str): - prompt_inputs = self._tokenize_prompt_input( + request_prompts, engine_prompts = await self._parse_chat_inputs( request, tokenizer, - prompt, + request.messages, + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) else: - # For MistralTokenizer - assert is_list_of(prompt, int), ( - "Prompt has to be either a string or a list of token ids") - prompt_inputs = TextTokensPrompt( - prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) - - engine_inputs = TokensPrompt( - prompt_token_ids=prompt_inputs["prompt_token_ids"]) - if mm_data is not None: - engine_inputs["multi_modal_data"] = mm_data + request_prompts, engine_prompts = self._parse_completion_inputs( + request, + tokenizer, + request.input, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) except ValueError as e: logger.exception("Error in applying extracting prompt inputs") return self.create_error_response(str(e)) @@ -218,11 +169,11 @@ async def create_embedding( try: pooling_params = request.to_pooling_params() - for i, prompt_inputs in enumerate(prompts): + for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - prompt_inputs, + request_prompts[i], params=pooling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) @@ -233,7 +184,7 @@ async def create_embedding( "for embedding models") generator = self.engine_client.encode( - engine_inputs, + engine_prompt, pooling_params, request_id_item, lora_request=lora_request, @@ -252,7 +203,7 @@ async def create_embedding( # Non-streaming response final_res_batch: List[Optional[EmbeddingRequestOutput]] - final_res_batch = [None] * len(prompts) + final_res_batch = [None] * len(engine_prompt) try: async for i, res in result_generator: final_res_batch[i] = res diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e6d2ab93d3363..afbf69d68e8d4 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -2,28 +2,36 @@ import pathlib from dataclasses import dataclass from http import HTTPStatus -from typing import Iterable, Iterator, 
List, Optional, Tuple, TypedDict, Union +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, + Sequence, Tuple, TypedDict, Union) from pydantic import Field from typing_extensions import Annotated from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + apply_hf_chat_template, + apply_mistral_chat_template, + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, ErrorResponse, + EmbeddingChatRequest, + EmbeddingCompletionRequest, + ErrorResponse, LoadLoraAdapterRequest, ModelCard, ModelList, ModelPermission, TokenizeChatRequest, TokenizeCompletionRequest, - TokenizeRequest, UnloadLoraAdapterRequest) +from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable +from vllm.inputs import TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -31,8 +39,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import AtomicCounter +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import AtomicCounter, is_list_of logger = init_logger(__name__) @@ -56,8 +64,14 @@ class LoRAModulePath: base_model_name: Optional[str] = None -AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, TokenizeRequest] +CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, + EmbeddingCompletionRequest, + TokenizeCompletionRequest] + +ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, + TokenizeChatRequest] + +AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest] class TextTokensPrompt(TypedDict): @@ -65,6 +79,9 @@ class TextTokensPrompt(TypedDict): prompt_token_ids: List[int] +RequestPrompt = Union[List[int], str, TextTokensPrompt] + + class OpenAIServing: def __init__( @@ -246,7 +263,8 @@ def _validate_input( token_num = len(input_ids) # Note: EmbeddingRequest doesn't have max_tokens - if isinstance(request, EmbeddingRequest): + if isinstance(request, + (EmbeddingChatRequest, EmbeddingCompletionRequest)): if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " @@ -367,6 +385,110 @@ def _tokenize_prompt_input_or_inputs( truncate_prompt_tokens=truncate_prompt_tokens, ) + def _parse_completion_inputs( + self, + request: CompletionLikeRequest, + tokenizer: AnyTokenizer, + input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + add_special_tokens: bool = True, + ) -> Tuple[Sequence[RequestPrompt], List[TokensPrompt]]: + request_prompts = [ + request_prompt + for request_prompt in self._tokenize_prompt_input_or_inputs( + request, + tokenizer, + input_or_inputs, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) + ] + + engine_prompts = [ + TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"]) + for request_prompt in request_prompts + ] + + return request_prompts, engine_prompts + + async def _parse_chat_inputs( + 
self, + request: ChatLikeRequest, + tokenizer: AnyTokenizer, + messages: List[ChatCompletionMessageParam], + chat_template: Optional[str] = None, + add_generation_prompt: bool = True, + continue_final_message: bool = False, + tool_dicts: Optional[List[Dict[str, Any]]] = None, + documents: Optional[List[Dict[str, str]]] = None, + chat_template_kwargs: Optional[Dict[str, Any]] = None, + tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + add_special_tokens: bool = False, + ) -> Tuple[Sequence[RequestPrompt], List[TokensPrompt]]: + conversation, mm_data_future = parse_chat_messages_futures( + messages, + self.model_config, + tokenizer, + ) + + request_prompt: Union[str, List[int]] + is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) + if is_mistral_tokenizer: + request_prompt = apply_mistral_chat_template( + tokenizer, + messages=messages, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tool_dicts, + documents=documents, + **(chat_template_kwargs or {}), + ) + else: + request_prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tool_dicts, + documents=documents, + **(chat_template_kwargs or {}), + ) + + mm_data = await mm_data_future + + if tool_parser is not None: + if not isinstance(request, ChatCompletionRequest): + msg = "Tool usage is only supported for Chat Completions API" + raise NotImplementedError(msg) + + request = tool_parser(tokenizer).adjust_request(request=request) + + if isinstance(request_prompt, str): + prompt_inputs = self._tokenize_prompt_input( + request, + tokenizer, + request_prompt, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) + else: + # For MistralTokenizer + assert is_list_of(request_prompt, int), ( + "Prompt has to be either a string or a list of token ids") + prompt_inputs = TextTokensPrompt( + prompt=tokenizer.decode(request_prompt), + prompt_token_ids=request_prompt) + + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["prompt_token_ids"]) + if mm_data is not None: + engine_prompt["multi_modal_data"] = mm_data + + return [request_prompt], [engine_prompt] + def _log_inputs( self, request_id: str, From 5a4f2719b92215183caef8b9de62186672a64449 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:23:36 +0000 Subject: [PATCH 08/43] Reinstate truncate_prompt_tokens check --- vllm/entrypoints/openai/serving_embedding.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 2a79e8775d528..3dfa74f35043e 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -133,6 +133,15 @@ async def create_embedding( truncate_prompt_tokens = None + if request.truncate_prompt_tokens is not None: + if request.truncate_prompt_tokens <= self.max_model_len: + truncate_prompt_tokens = request.truncate_prompt_tokens + else: + return self.create_error_response( + "truncate_prompt_tokens value is " + "greater than max_model_len." + " Please, select a smaller truncation size.") + # Schedule the request and get the result generator. 
generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] try: From 4a969b4b63df510355b98767230d82c566c825bd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:34:52 +0000 Subject: [PATCH 09/43] Rename --- vllm/entrypoints/openai/serving_embedding.py | 10 +++++----- vllm/entrypoints/openai/serving_engine.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 3dfa74f35043e..f1d61563e4d2f 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -142,8 +142,6 @@ async def create_embedding( "greater than max_model_len." " Please, select a smaller truncation size.") - # Schedule the request and get the result generator. - generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] try: ( lora_request, @@ -153,7 +151,7 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) if isinstance(request, EmbeddingChatRequest): - request_prompts, engine_prompts = await self._parse_chat_inputs( + request_prompts, engine_prompts = await self._preprocess_chat( request, tokenizer, request.messages, @@ -164,7 +162,7 @@ async def create_embedding( add_special_tokens=request.add_special_tokens, ) else: - request_prompts, engine_prompts = self._parse_completion_inputs( + request_prompts, engine_prompts = self._preprocess_completion( request, tokenizer, request.input, @@ -172,9 +170,11 @@ async def create_embedding( add_special_tokens=request.add_special_tokens, ) except ValueError as e: - logger.exception("Error in applying extracting prompt inputs") + logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index afbf69d68e8d4..c90d4a480f5ec 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -385,7 +385,7 @@ def _tokenize_prompt_input_or_inputs( truncate_prompt_tokens=truncate_prompt_tokens, ) - def _parse_completion_inputs( + def _preprocess_completion( self, request: CompletionLikeRequest, tokenizer: AnyTokenizer, @@ -411,7 +411,7 @@ def _parse_completion_inputs( return request_prompts, engine_prompts - async def _parse_chat_inputs( + async def _preprocess_chat( self, request: ChatLikeRequest, tokenizer: AnyTokenizer, From 279b9ce865116c787061963afb264cf9ff145e1a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:36:02 +0000 Subject: [PATCH 10/43] Fix --- vllm/entrypoints/openai/serving_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index f1d61563e4d2f..200a40e9cdd0e 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -212,7 +212,7 @@ async def create_embedding( # Non-streaming response final_res_batch: List[Optional[EmbeddingRequestOutput]] - final_res_batch = [None] * len(engine_prompt) + final_res_batch = [None] * len(request_prompts) try: async for i, res in result_generator: final_res_batch[i] = res From 7de803fa43e17e371e235d85253737648971c6c7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:36:25 +0000 Subject: [PATCH 11/43] Remove unused code --- vllm/entrypoints/openai/serving_embedding.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 200a40e9cdd0e..481346116efb8 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -233,11 +233,3 @@ async def create_embedding( return self.create_error_response(str(e)) return response - - def _check_embedding_mode(self, embedding_mode: bool) -> bool: - if not embedding_mode: - logger.warning( - "embedding_mode is False. 
Embedding API will not work.") - else: - logger.info("Activating the server engine with embedding enabled.") - return embedding_mode From c1ef363116f0a59419346230913716bea68af266 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:51:00 +0000 Subject: [PATCH 12/43] Migrate tokenization API --- .../openai/serving_tokenization.py | 74 ++++++++----------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 022715cda2316..c0bf0631c64ab 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -62,59 +62,48 @@ async def create_tokenize( request_id = f"tokn-{random_uuid()}" - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - - tokenizer = await self.engine_client.get_tokenizer(lora_request) + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) - prompt: Union[str, List[int]] - if isinstance(request, TokenizeChatRequest): - model_config = self.model_config + tokenizer = await self.engine_client.get_tokenizer(lora_request) - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) - - if isinstance(tokenizer, MistralTokenizer): - prompt = apply_mistral_chat_template( + if isinstance(request, TokenizeChatRequest): + request_prompts, engine_prompts = await self._preprocess_chat( + request, tokenizer, - messages=request.messages, + request.messages, chat_template=self.chat_template, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, + add_special_tokens=request.add_special_tokens, ) else: - prompt = apply_hf_chat_template( + request_prompts, engine_prompts = self._preprocess_completion( + request, tokenizer, - conversation=conversation, - chat_template=self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, + request.prompt, + add_special_tokens=request.add_special_tokens, ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + - mm_data = await mm_data_future - if mm_data: - logger.warning( - "Multi-modal inputs are ignored during tokenization") - else: - prompt = request.prompt + input_ids: List[int] = [] + for i, engine_prompt in enumerate(engine_prompts): + self._log_inputs(request_id, + request_prompts[i], + params=None, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) - self._log_inputs(request_id, - prompt, - params=None, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) - - # Silently ignore prompt adapter since it does not affect tokenization + # Silently ignore prompt adapter since it does not affect tokenization + # (Unlike in Embeddings API where an error is raised) - prompt_input = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - add_special_tokens=request.add_special_tokens, - ) - input_ids = prompt_input["prompt_token_ids"] + input_ids.extend(engine_prompt["prompt_token_ids"]) return TokenizeResponse(tokens=input_ids, count=len(input_ids), @@ -143,9 +132,8 @@ async def create_detokenize( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - if prompt_adapter_request is not None: - raise NotImplementedError("Prompt adapter is not supported " - "for tokenization") + # Silently 
ignore prompt adapter since it does not affect tokenization + # (Unlike in Embeddings API where an error is raised) prompt_input = self._tokenize_prompt_input( request, From a10fa85f64b9074dda357d2d56430e9ba076be0f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:51:05 +0000 Subject: [PATCH 13/43] Some fixes --- vllm/entrypoints/openai/serving_embedding.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 481346116efb8..1899bd18b370a 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -155,7 +155,7 @@ async def create_embedding( request, tokenizer, request.messages, - chat_template=self.chat_template, + chat_template=request.chat_template or self.chat_template, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, truncate_prompt_tokens=truncate_prompt_tokens, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c90d4a480f5ec..a9d073fa5d029 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -492,7 +492,7 @@ async def _preprocess_chat( def _log_inputs( self, request_id: str, - inputs: Union[str, List[int], TextTokensPrompt], + inputs: RequestPrompt, params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], From 89e0710aa813fb765102ed8995d4957fd7c38475 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:52:56 +0000 Subject: [PATCH 14/43] format --- vllm/entrypoints/openai/serving_tokenization.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index c0bf0631c64ab..9a5276af7d5ad 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -90,7 +90,6 @@ async def create_tokenize( except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - input_ids: List[int] = [] for i, engine_prompt in enumerate(engine_prompts): @@ -100,8 +99,8 @@ async def create_tokenize( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - # Silently ignore prompt adapter since it does not affect tokenization - # (Unlike in Embeddings API where an error is raised) + # Silently ignore prompt adapter since it does not affect + # tokenization (Unlike in Embeddings API where an error is raised) input_ids.extend(engine_prompt["prompt_token_ids"]) From 81b94ded6c0b99891618d3464645db2d47c5dd8b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 02:53:16 +0000 Subject: [PATCH 15/43] remoev unused imports --- vllm/entrypoints/openai/serving_tokenization.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 9a5276af7d5ad..b2019b2425acd 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,10 +2,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, 
- parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -20,7 +17,6 @@ LoRAModulePath, OpenAIServing) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.utils import random_uuid logger = init_logger(__name__) From a79d3b2805b203eed58233281a282a3bdc9af4ec Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 12:33:13 +0000 Subject: [PATCH 16/43] Migrate chat and completion APIs --- vllm/entrypoints/openai/serving_chat.py | 211 ++++++++---------- vllm/entrypoints/openai/serving_completion.py | 41 ++-- vllm/entrypoints/openai/serving_embedding.py | 10 +- vllm/entrypoints/openai/serving_engine.py | 8 +- .../openai/serving_tokenization.py | 6 +- 5 files changed, 130 insertions(+), 146 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 05c34980af350..7440a82bd947d 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,11 +10,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (ConversationMessage, - apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -27,10 +23,8 @@ from vllm.entrypoints.openai.serving_engine import (BaseModelPath, LoRAModulePath, OpenAIServing, - PromptAdapterPath, - TextTokensPrompt) + PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager -from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams @@ -38,7 +32,7 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import is_list_of, iterate_with_cancellation +from vllm.utils import iterate_with_cancellation logger = init_logger(__name__) @@ -118,138 +112,111 @@ async def create_chat_completion( prompt_adapter_request, ) = self._maybe_get_adapters(request) - model_config = self.model_config tokenizer = await self.engine_client.get_tokenizer(lora_request) - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) + tool_parser = self.tool_parser + + # validation for OpenAI tools + # tool_choice = "required" is not supported + if request.tool_choice == "required": + return self.create_error_response( + "tool_choice = \"required\" is not supported!") + + if (request.tool_choice == "auto" and + not (self.enable_auto_tools and tool_parser is not None) + and not isinstance(tokenizer, MistralTokenizer)): + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + "\"auto\" tool choice requires " + "--enable-auto-tool-choice and --tool-call-parser to be set" + ) tool_dicts = None if request.tools is None else [ tool.model_dump() for tool in request.tools ] - prompt: Union[str, List[int]] - 
is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) - if is_mistral_tokenizer: - prompt = apply_mistral_chat_template( - tokenizer, - messages=request.messages, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) - else: - prompt = apply_hf_chat_template( - tokenizer, - conversation=conversation, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) - - mm_data = await mm_data_future - except Exception as e: - logger.exception("Error in applying chat template from request") + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=request.documents, + chat_template_kwargs=request.chat_template_kwargs, + tool_parser=tool_parser, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - # validation for OpenAI tools - # tool_choice = "required" is not supported - if request.tool_choice == "required": - return self.create_error_response( - "tool_choice = \"required\" is not supported!") - - if not is_mistral_tokenizer and request.tool_choice == "auto" and not ( - self.enable_auto_tools and self.tool_parser is not None): - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return self.create_error_response( - "\"auto\" tool choice requires " - "--enable-auto-tool-choice and --tool-call-parser to be set") - request_id = f"chatcmpl-{request.request_id}" request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: raw_request.state.request_metadata = request_metadata + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[RequestOutput, None]] = [] try: - if self.enable_auto_tools and self.tool_parser: - request = self.tool_parser(tokenizer).adjust_request( - request=request) - - if isinstance(prompt, str): - prompt_inputs = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) - else: - # For MistralTokenizer - assert is_list_of(prompt, int), ( - "Prompt has to be either a string or a list of token ids") - prompt_inputs = TextTokensPrompt( - prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) - - assert prompt_inputs is not None - - sampling_params: Union[SamplingParams, BeamSearchParams] - default_max_tokens = self.max_model_len - len( - prompt_inputs["prompt_token_ids"]) - if request.use_beam_search: - sampling_params = request.to_beam_search_params( - default_max_tokens) - else: - sampling_params = request.to_sampling_params( - default_max_tokens) - - self._log_inputs(request_id, - prompt_inputs, - params=sampling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) - - engine_inputs = TokensPrompt( - prompt_token_ids=prompt_inputs["prompt_token_ids"]) - if mm_data is not None: - engine_inputs["multi_modal_data"] = mm_data - - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled and raw_request: - trace_headers = extract_trace_headers(raw_request.headers) - if (not is_tracing_enabled and raw_request - and contains_trace_headers(raw_request.headers)): - log_tracing_disabled_warning() - - if isinstance(sampling_params, BeamSearchParams): - result_generator = self.engine_client.beam_search( - engine_inputs['prompt_token_ids'], - request_id, - sampling_params, - ) - else: - result_generator = self.engine_client.generate( - engine_inputs, - sampling_params, - request_id, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=request.priority, - ) + for i, engine_prompt in enumerate(engine_prompts): + sampling_params: Union[SamplingParams, BeamSearchParams] + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + if request.use_beam_search: + sampling_params = request.to_beam_search_params( + default_max_tokens) + else: + sampling_params = request.to_sampling_params( + default_max_tokens) + + self._log_inputs(request_id, + request_prompts[i], + params=sampling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + is_tracing_enabled = (await + self.engine_client.is_tracing_enabled()) + trace_headers = None + if is_tracing_enabled and raw_request: + trace_headers = extract_trace_headers(raw_request.headers) + if (not is_tracing_enabled and raw_request + and contains_trace_headers(raw_request.headers)): + log_tracing_disabled_warning() + + if isinstance(sampling_params, BeamSearchParams): + generator = self.engine_client.beam_search( + engine_prompt['prompt_token_ids'], + request_id, + sampling_params, + ) + else: + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=request.priority, + ) + + generators.append(generator) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + assert len(generators) == 1 + 
result_generator, = generators + if raw_request: result_generator = iterate_with_cancellation( result_generator, raw_request.is_disconnected) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 7f5a0b2ea51c4..c0be8977e5858 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -95,8 +95,6 @@ async def create_completion( if raw_request: raw_request.state.request_metadata = request_metadata - # Schedule the request and get the result generator. - generators: List[AsyncGenerator[RequestOutput, None]] = [] try: ( lora_request, @@ -105,19 +103,24 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) - prompts = list( - self._tokenize_prompt_input_or_inputs( - request, - tokenizer, - request.prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - )) + request_prompts, engine_prompts = self._preprocess_completion( + request, + tokenizer, + request.prompt, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - for i, prompt_inputs in enumerate(prompts): + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[RequestOutput, None]] = [] + try: + for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] default_max_tokens = self.max_model_len - len( - prompt_inputs["prompt_token_ids"]) + engine_prompt["prompt_token_ids"]) if request.use_beam_search: sampling_params = request.to_beam_search_params( default_max_tokens) @@ -128,7 +131,7 @@ async def create_completion( request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - prompt_inputs, + request_prompts[i], params=sampling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) @@ -144,7 +147,7 @@ async def create_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( - prompt_inputs["prompt_token_ids"], + engine_prompt["prompt_token_ids"], request_id_item, sampling_params, ) @@ -152,7 +155,7 @@ async def create_completion( generator = self.engine_client.generate( { "prompt_token_ids": - prompt_inputs["prompt_token_ids"] + engine_prompt["prompt_token_ids"] }, sampling_params, request_id_item, @@ -170,6 +173,8 @@ async def create_completion( result_generator = merge_async_iterators( *generators, is_cancelled=raw_request.is_disconnected) + num_prompts = len(engine_prompts) + # Similar to the OpenAI API, when n != best_of, we do not stream the # results. In addition, we do not stream the results when use # beam search. 
@@ -185,12 +190,12 @@ async def create_completion( request_id, created_time, model_name, - num_prompts=len(prompts), + num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata) # Non-streaming response - final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts) + final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res @@ -202,7 +207,7 @@ async def create_completion( # We did not pass it into vLLM engine to avoid being redundant # with the inputs token IDs if final_res.prompt is None: - final_res.prompt = prompts[i]["prompt"] + final_res.prompt = request_prompts[i]["prompt"] final_res_batch_checked = cast(List[RequestOutput], final_res_batch) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 1899bd18b370a..f2761b2079aaa 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -151,7 +151,11 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) if isinstance(request, EmbeddingChatRequest): - request_prompts, engine_prompts = await self._preprocess_chat( + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( request, tokenizer, request.messages, @@ -210,9 +214,11 @@ async def create_embedding( is_cancelled=raw_request.is_disconnected if raw_request else None, ) + num_prompts = len(engine_prompts) + # Non-streaming response final_res_batch: List[Optional[EmbeddingRequestOutput]] - final_res_batch = [None] * len(request_prompts) + final_res_batch = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index a9d073fa5d029..3748f8910dcaa 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -11,6 +11,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ConversationMessage, apply_hf_chat_template, apply_mistral_chat_template, parse_chat_messages_futures) @@ -392,7 +393,7 @@ def _preprocess_completion( input_or_inputs: Union[str, List[str], List[int], List[List[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Tuple[Sequence[RequestPrompt], List[TokensPrompt]]: + ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]: request_prompts = [ request_prompt for request_prompt in self._tokenize_prompt_input_or_inputs( @@ -425,7 +426,8 @@ async def _preprocess_chat( tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = False, - ) -> Tuple[Sequence[RequestPrompt], List[TokensPrompt]]: + ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], + List[TokensPrompt]]: conversation, mm_data_future = parse_chat_messages_futures( messages, self.model_config, @@ -487,7 +489,7 @@ async def _preprocess_chat( if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data - return [request_prompt], [engine_prompt] + return conversation, [request_prompt], [engine_prompt] def _log_inputs( self, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 
b2019b2425acd..ddc80cbaa55a1 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -67,7 +67,11 @@ async def create_tokenize( tokenizer = await self.engine_client.get_tokenizer(lora_request) if isinstance(request, TokenizeChatRequest): - request_prompts, engine_prompts = await self._preprocess_chat( + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( request, tokenizer, request.messages, From 8b950dd0ecd6f3a282a511e2524913e099bdb18b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 13:54:57 +0000 Subject: [PATCH 17/43] Factor out trace headers code --- vllm/entrypoints/openai/serving_chat.py | 12 ++--------- vllm/entrypoints/openai/serving_completion.py | 12 ++--------- vllm/entrypoints/openai/serving_embedding.py | 11 ++++++---- vllm/entrypoints/openai/serving_engine.py | 21 +++++++++++++++++-- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7440a82bd947d..ad7e287501a4e 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -29,8 +29,6 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import iterate_with_cancellation @@ -183,14 +181,8 @@ async def create_chat_completion( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled and raw_request: - trace_headers = extract_trace_headers(raw_request.headers) - if (not is_tracing_enabled and raw_request - and contains_trace_headers(raw_request.headers)): - log_tracing_disabled_warning() + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c0be8977e5858..28b0f2c4bd7fd 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -29,8 +29,6 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators, random_uuid @@ -136,14 +134,8 @@ async def create_completion( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled: - trace_headers = extract_trace_headers(raw_request.headers) - if not is_tracing_enabled and contains_trace_headers( - raw_request.headers): - log_tracing_disabled_warning() + trace_headers = (await + self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( diff --git a/vllm/entrypoints/openai/serving_embedding.py 
b/vllm/entrypoints/openai/serving_embedding.py index f2761b2079aaa..d6f0510791352 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -150,6 +150,10 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for embedding models") + if isinstance(request, EmbeddingChatRequest): ( conversation, @@ -191,16 +195,15 @@ async def create_embedding( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - if prompt_adapter_request is not None: - raise NotImplementedError( - "Prompt adapter is not supported " - "for embedding models") + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) generator = self.engine_client.encode( engine_prompt, pooling_params, request_id_item, lora_request=lora_request, + trace_headers=trace_headers, priority=request.priority, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 3748f8910dcaa..cfbbe33864ff4 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -2,10 +2,11 @@ import pathlib from dataclasses import dataclass from http import HTTPStatus -from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, - Sequence, Tuple, TypedDict, Union) +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, + Optional, Sequence, Tuple, TypedDict, Union) from pydantic import Field +from starlette.datastructures import Headers from typing_extensions import Annotated from vllm.config import ModelConfig @@ -40,6 +41,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import AtomicCounter, is_list_of @@ -522,6 +525,20 @@ def _log_inputs( prompt_adapter_request=prompt_adapter_request, ) + async def _get_trace_headers( + self, + headers: Headers, + ) -> Optional[Mapping[str, str]]: + is_tracing_enabled = await self.engine_client.is_tracing_enabled() + + if is_tracing_enabled: + return extract_trace_headers(headers) + + if contains_trace_headers(headers): + log_tracing_disabled_warning() + + return None + @staticmethod def _get_decoded_token(logprob: Logprob, token_id: int, From f5e72ffd38a6640db5b9ba7cf3ddc22fdc18d244 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 13:59:38 +0000 Subject: [PATCH 18/43] Clean --- vllm/entrypoints/openai/serving_embedding.py | 2 +- vllm/entrypoints/openai/serving_tokenization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index d6f0510791352..e1db99ec2f88c 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -156,7 +156,7 @@ async def create_embedding( if isinstance(request, EmbeddingChatRequest): ( - conversation, + _, request_prompts, engine_prompts, ) = await self._preprocess_chat( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index ddc80cbaa55a1..1fd82304f7a4d 100644 --- 
a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -68,7 +68,7 @@ async def create_tokenize( if isinstance(request, TokenizeChatRequest): ( - conversation, + _, request_prompts, engine_prompts, ) = await self._preprocess_chat( From 9cd1ac3d11c8489d417ef8a6495c77748ff4db2a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 14:04:41 +0000 Subject: [PATCH 19/43] More precise error handling --- vllm/entrypoints/openai/serving_chat.py | 3 +++ vllm/entrypoints/openai/serving_completion.py | 8 ++++++-- vllm/entrypoints/openai/serving_embedding.py | 5 +++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 252f970742495..9551b4f2091dd 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -581,6 +581,9 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) assert final_res is not None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index db31b1153d97e..570232be38379 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -189,7 +189,13 @@ async def create_completion( try: async for i, res in result_generator: final_res_batch[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + try: for i, final_res in enumerate(final_res_batch): assert final_res is not None @@ -211,8 +217,6 @@ async def create_completion( tokenizer, request_metadata, ) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e1db99ec2f88c..90cd9145fac1e 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -225,7 +225,10 @@ async def create_embedding( try: async for i, res in result_generator: final_res_batch[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + try: for final_res in final_res_batch: assert final_res is not None @@ -235,8 +238,6 @@ async def create_embedding( response = request_output_to_embedding_response( final_res_batch_checked, request_id, created_time, model_name, encoding_format) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) From d775150a9fa1628080f451e2438253743026991e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 14:52:47 +0000 Subject: [PATCH 20/43] Add and update tests --- tests/entrypoints/openai/test_embedding.py | 115 +++++++++++------- .../openai/test_vision_embedding.py | 94 ++++++++++++++ 2 files changed, 162 insertions(+), 47 deletions(-) create mode 100644 tests/entrypoints/openai/test_vision_embedding.py diff --git a/tests/entrypoints/openai/test_embedding.py 
b/tests/entrypoints/openai/test_embedding.py index f119c6c1201c9..12016420fb465 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -4,14 +4,15 @@ import openai import pytest import pytest_asyncio +import requests from ...utils import RemoteOpenAIServer -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @pytest.fixture(scope="module") -def embedding_server(): +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -21,29 +22,25 @@ def embedding_server(): "8192", ] - with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @pytest_asyncio.fixture -async def embedding_client(embedding_server): - async with embedding_server.get_async_client() as async_client: +async def client(server): + async with server.get_async_client() as async_client: yield async_client @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -57,7 +54,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, # test using token IDs input_tokens = [1, 1, 1, 1, 1] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -71,18 +68,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): # test List[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -90,11 +83,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 32 + assert embeddings.usage.total_tokens == 32 # test List[List[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -108,22 +104,53 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_embedding(server: RemoteOpenAIServer, + model_name: str): + messages = [{ + "role": "user", + "content": "The cat sat on the mat.", + }, { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embeddings = response.json() + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 32 + assert embeddings.usage.total_tokens == 32 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - responses_float = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="float") + responses_float = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="float") - responses_base64 = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="base64") + responses_base64 = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="base64") decoded_responses_base64_data = [] for data in responses_base64.data: @@ -137,8 +164,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, 1] # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await embedding_client.embeddings.create( - input=input_texts, model=model_name) + responses_default = await client.embeddings.create(input=input_texts, + model=model_name) assert responses_float.data[0].embedding == responses_default.data[ 0].embedding @@ -147,18 +174,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def 
test_single_embedding_truncation(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) @@ -173,7 +197,7 @@ async def test_single_embedding_truncation( 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) @@ -187,18 +211,15 @@ async def test_single_embedding_truncation( @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation_invalid( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] with pytest.raises(openai.BadRequestError): - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 8193}) diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py new file mode 100644 index 0000000000000..938fcbcc958e1 --- /dev/null +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -0,0 +1,94 @@ +from typing import Dict + +import pytest +import pytest_asyncio +import requests + +from vllm.multimodal.utils import encode_image_base64, fetch_image + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" +MAXIMUM_IMAGES = 2 + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embedding", + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_image() -> Dict[str, str]: + return { + image_url: encode_image_base64(fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + 
"image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embeddings = response.json() + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 3072 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 17 + assert embeddings.usage.total_tokens == 17 From f2b5846f6522ac332657429fc158d326cf8a9baf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 14:53:36 +0000 Subject: [PATCH 21/43] Cleanup --- vllm/entrypoints/openai/serving_embedding.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 90cd9145fac1e..779fce3c04869 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -96,12 +96,7 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - # If this is None we use the tokenizer's default chat template - # the list of commonly-used chat template names for HF named templates - hf_chat_templates: List[str] = ['default', 'tool_use'] - self.chat_template = chat_template \ - if chat_template in hf_chat_templates \ - else load_chat_template(chat_template) + self.chat_template = load_chat_template(chat_template) self._enabled = check_embedding_mode(model_config) From 4a2580666eeb63520dcda1bd77fa7cc9e1629d88 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 15:28:00 +0000 Subject: [PATCH 22/43] Fix tests --- tests/entrypoints/openai/test_embedding.py | 27 ++++++++++++++----- .../openai/test_vision_embedding.py | 12 ++++----- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 12016420fb465..255bbf1194749 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -6,9 +6,12 @@ import pytest_asyncio import requests +from vllm.transformers_utils.tokenizer import get_tokenizer + from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @pytest.fixture(scope="module") @@ -20,6 +23,8 @@ def server(): "--enforce-eager", "--max-model-len", "8192", + "--chat-template", + DUMMY_CHAT_TEMPLATE, ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -126,13 +131,23 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, }) response.raise_for_status() + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + prompt = tokenizer.apply_chat_template( + messages, + chat_template=DUMMY_CHAT_TEMPLATE, + add_generation_prompt=True, + continue_final_message=False, + tokenize=False, + ) + tokens = tokenizer.encode(prompt, add_special_tokens=False) + embeddings = response.json() - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) == 4096 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 32 - assert embeddings.usage.total_tokens == 32 + assert embeddings["id"] is not None + assert 
len(embeddings["data"]) == 1 + assert len(embeddings["data"][0]["embedding"]) == 4096 + assert embeddings["usage"]["completion_tokens"] == 0 + assert embeddings["usage"]["prompt_tokens"] == len(tokens) + assert embeddings["usage"]["total_tokens"] == len(tokens) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 938fcbcc958e1..73a69da32e434 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -86,9 +86,9 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, response.raise_for_status() embeddings = response.json() - assert embeddings.id is not None - assert len(embeddings.data) == 1 - assert len(embeddings.data[0].embedding) == 3072 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 17 - assert embeddings.usage.total_tokens == 17 + assert embeddings["id"] is not None + assert len(embeddings["data"]) == 1 + assert len(embeddings["data"][0]["embedding"]) == 3072 + assert embeddings["usage"]["completion_tokens"] == 0 + assert embeddings["usage"]["prompt_tokens"] == 771 + assert embeddings["usage"]["total_tokens"] == 771 From bbcfc6aef61187cea73f02203b42b198d139bd13 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 15:38:34 +0000 Subject: [PATCH 23/43] Update docs --- docs/source/dev/pooling_params.rst | 5 +++ docs/source/index.rst | 1 + .../serving/openai_compatible_server.md | 41 +++++++++++++++---- vllm/pooling_params.py | 4 +- 4 files changed, 40 insertions(+), 11 deletions(-) create mode 100644 docs/source/dev/pooling_params.rst diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst new file mode 100644 index 0000000000000..334e0287aff09 --- /dev/null +++ b/docs/source/dev/pooling_params.rst @@ -0,0 +1,5 @@ +Pooling Parameters +================== + +.. autoclass:: vllm.PoolingParams + :members: diff --git a/docs/source/index.rst b/docs/source/index.rst index c328c049b430c..2399fcf5faec9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -134,6 +134,7 @@ Documentation :caption: Developer Documentation dev/sampling_params + dev/pooling_params dev/offline_inference/offline_index dev/engine/engine_index dev/kernel/paged_attention diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 3b8708a13086c..e779288520fd5 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -26,13 +26,17 @@ print(completion.choices[0].message) ``` ## API Reference -Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except: -- Chat: `tools`, and `tool_choice`. -- Completions: `suffix`. + +Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We currently support the following APIs: + +- Completions API (except for `suffix` parameter) +- Chat Completions API (except for `parallel_tool_calls` and `user` parameter) +- Embeddings API vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). ## Extra Parameters + vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. 
Or directly merge them into the JSON payload if you are using HTTP call directly. @@ -49,7 +53,25 @@ completion = client.chat.completions.create( ) ``` +### Extra Parameters for Completions API +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + ### Extra Parameters for Chat Completions API + The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py @@ -66,21 +88,22 @@ The following extra parameters are supported: :end-before: end-chat-completion-extra-params ``` -### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +### Extra Parameters for Embeddings API + +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params +:start-after: begin-embedding-sampling-params +:end-before: end-embedding-sampling-params ``` The following extra parameters are supported: ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params +:start-after: begin-embedding-extra-params +:end-before: end-embedding-extra-params ``` ## Chat Template diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 7461fb51989c6..2635c0bccd1c4 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,7 +7,7 @@ class PoolingParams( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """Pooling parameters for pooling. + """Pooling parameters for embeddings API. Attributes: additional_data: Any additional data needed for pooling. @@ -16,7 +16,7 @@ class PoolingParams( def clone(self) -> "PoolingParams": """Returns a deep copy of the PoolingParams instance.""" - return PoolingParams(additional_data=self.additional_data, ) + return PoolingParams(additional_data=self.additional_data) def __repr__(self) -> str: return (f"PoolingParams(" From b6820b7404388795bf1b4af3b267f9d9566d4c14 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 16:20:21 +0000 Subject: [PATCH 24/43] Add docs --- docs/source/models/vlm.rst | 47 ++++++++++++++++++- .../serving/openai_compatible_server.md | 24 +++++++--- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index a47902ab4fc9d..56d5a6ca747ae 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -185,7 +185,7 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 .. 
important:: - Since OpenAI Vision API is based on `Chat Completions `_ API, + Since OpenAI Vision API is based on `Chat Completions API `_, a chat template is **required** to launch the API server. Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. @@ -253,3 +253,48 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p .. note:: There is no need to format the prompt in the API request since it will be handled by the server. + +vLLM Embedding API +^^^^^^^^^^^^^^^^^^ + +vLLM's Embedding API is a superset of OpenAI's `Embeddings API `_, +where chat conversations can be passed instead of batched inputs. This enables multi-modal inputs to be passed to embedding models. + +In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. + +.. code-block:: bash + + vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 + +.. important:: + + Since ``VLM2Vec`` has the same model architecture as ``Phi-3.5-vision``, we have to explicitly pass ``--task embedding`` + to run this model in embedding mode instead of text generation mode. + +Since this schema is not defined by OpenAI client, we have to post a request to the server using the lower-level ``requests`` library: + +.. code-block:: python + + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + messages = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embedding_json = response.json() + print("Embedding output:", embedding_json["data"][0]["embedding"]) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index e779288520fd5..54c3e40a31fba 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -27,13 +27,23 @@ print(completion.choices[0].message) ## API Reference -Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We currently support the following APIs: - -- Completions API (except for `suffix` parameter) -- Chat Completions API (except for `parallel_tool_calls` and `user` parameter) -- Embeddings API - -vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). +We currently support the following OpenAI APIs: + +- [Completions API](https://platform.openai.com/docs/api-reference/completions) + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported. + - See more details in [Using VLMs](../models/vlm.rst). + - *Note: `image_url.detail` parameter is not supported.* + - We also support `audio_url` content type for audio files. + - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. 
+ - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) + - Instead of `inputs`, you can alternatively pass in a list of `messages` (same schema as Chat Completions API), + which will be treated as a single prompt to the model according to its chat template. + - This enables multi-modal inputs to be passed to embedding models. See more details in [Using VLMs](../models/vlm.rst). + - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Extra Parameters From fed887ac989ffdf6c2cf08b2e75c5a4ab1580dfe Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 16:25:44 +0000 Subject: [PATCH 25/43] Fix doc failure --- docs/source/serving/openai_compatible_server.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 54c3e40a31fba..1d8efa045ab26 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -104,8 +104,8 @@ The following [pooling parameters (click through to see documentation)](../dev/p ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-embedding-sampling-params -:end-before: end-embedding-sampling-params +:start-after: begin-embedding-pooling-params +:end-before: end-embedding-pooling-params ``` The following extra parameters are supported: From 1774b27d56772b2a72de901e5fcb8ce94cd8c2f6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 16:40:08 +0000 Subject: [PATCH 26/43] Mock out starlette --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 8435129e752e1..8c13419e11df9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -97,6 +97,7 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ "aiohttp", + "starlette", "compressed_tensors", "cpuinfo", "cv2", From c94aa9348919387a4a2e8ab3c3c12e3c0aa61119 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:04:49 +0000 Subject: [PATCH 27/43] Try fix docs --- docs/requirements-docs.txt | 2 ++ docs/source/conf.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index d58f226136918..e3e35844405ac 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -13,5 +13,7 @@ torch py-cpuinfo transformers mistral_common >= 1.3.4 +aiohttp +starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 8c13419e11df9..c7b638473a931 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -96,8 +96,6 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. 
autodoc_mock_imports = [ - "aiohttp", - "starlette", "compressed_tensors", "cpuinfo", "cv2", @@ -144,6 +142,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: "python": ("https://docs.python.org/3", None), "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "aiohttp": ("https://docs.aiohttp.org/en/stable", None), "pillow": ("https://pillow.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "torch": ("https://pytorch.org/docs/stable", None), From e2ecbcd4da8aaa1bb967d3d596321550489fe016 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:20:53 +0000 Subject: [PATCH 28/43] Cleanup docs --- docs/source/getting_started/quickstart.rst | 2 +- docs/source/models/vlm.rst | 34 +++++++++---------- .../serving/openai_compatible_server.md | 5 ++- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 55b929b4f6fd9..00b762ccc2ccb 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -157,7 +157,7 @@ You can use the `create chat completion `_, +vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, where chat conversations can be passed instead of batched inputs. This enables multi-modal inputs to be passed to embedding models. In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. @@ -280,20 +280,20 @@ Since this schema is not defined by OpenAI client, we have to post a request to image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - messages = [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }] - - response = requests.post(server.url_for("v1/embeddings"), - json={ - "model": model_name, - "messages": messages, - "encoding_format": "float" - }) + response = requests.post( + server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float" + }, + ) response.raise_for_status() embedding_json = response.json() diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1d8efa045ab26..4d96fef81b4a1 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -32,8 +32,7 @@ We currently support the following OpenAI APIs: - [Completions API](https://platform.openai.com/docs/api-reference/completions) - *Note: `suffix` parameter is not supported.* - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) - - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported. - - See more details in [Using VLMs](../models/vlm.rst). + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst). - *Note: `image_url.detail` parameter is not supported.* - We also support `audio_url` content type for audio files. - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. 
@@ -42,7 +41,7 @@ We currently support the following OpenAI APIs: - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) - Instead of `inputs`, you can alternatively pass in a list of `messages` (same schema as Chat Completions API), which will be treated as a single prompt to the model according to its chat template. - - This enables multi-modal inputs to be passed to embedding models. See more details in [Using VLMs](../models/vlm.rst). + - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Extra Parameters From fbbd8b152bddb2d059e5d81c3c9cea241efbaf3e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:22:43 +0000 Subject: [PATCH 29/43] Fix newlines --- vllm/entrypoints/openai/protocol.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3c66203f4df29..8193982efbf47 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -290,7 +290,6 @@ class ChatCompletionRequest(OpenAIBaseModel): "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " "through out the inference process and return in response.")) - # doc: end-chat-completion-extra-params def to_beam_search_params(self, @@ -588,7 +587,6 @@ class CompletionRequest(OpenAIBaseModel): "The priority of the request (lower means earlier handling; " "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) - # doc: end-completion-extra-params def to_beam_search_params(self, @@ -713,7 +711,6 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): # doc: begin-embedding-pooling-params additional_data: Optional[Any] = None - # doc: end-embedding-pooling-params # doc: begin-embedding-extra-params @@ -729,7 +726,6 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): "The priority of the request (lower means earlier handling; " "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) - # doc: end-embedding-extra-params def to_pooling_params(self): @@ -747,6 +743,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): # doc: begin-chat-embedding-pooling-params additional_data: Optional[Any] = None + # doc: end-chat-embedding-pooling-params # doc: begin-chat-embedding-extra-params add_generation_prompt: bool = Field( @@ -793,7 +790,6 @@ class EmbeddingChatRequest(OpenAIBaseModel): "The priority of the request (lower means earlier handling; " "default: 0). 
Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) - # doc: end-chat-embedding-extra-params @model_validator(mode="before") From 50ad3aae7646550550eef41a9ac95948ae9c048f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:23:52 +0000 Subject: [PATCH 30/43] Reword --- docs/source/serving/openai_compatible_server.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 4d96fef81b4a1..44561f10ffe0f 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -39,7 +39,7 @@ We currently support the following OpenAI APIs: - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) - - Instead of `inputs`, you can alternatively pass in a list of `messages` (same schema as Chat Completions API), + - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), which will be treated as a single prompt to the model according to its chat template. - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* From 9c1df21720ee77fb229228aacdb77dfc296e2b54 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:27:45 +0000 Subject: [PATCH 31/43] Fix --- docs/source/models/vlm.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 95f80ec11d00a..2d18de71d53f1 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -258,7 +258,8 @@ Chat Embeddings API ^^^^^^^^^^^^^^^^^^^ vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, -where chat conversations can be passed instead of batched inputs. This enables multi-modal inputs to be passed to embedding models. +where a list of ``messages`` can be passed instead of batched ``inputs``. The format of ``messages`` is the same as in Chat Completions API. +This enables multi-modal inputs to be passed to embedding models. In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. 
@@ -281,7 +282,7 @@ Since this schema is not defined by OpenAI client, we have to post a request to image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" response = requests.post( - server.url_for("v1/embeddings"), + "http://localhost:8000/v1/embeddings", json={ "model": model_name, "messages": [{ From 8049030088ec09441a785b47f797aefa9e7019eb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:29:12 +0000 Subject: [PATCH 32/43] Update --- docs/source/models/vlm.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 2d18de71d53f1..4dfac8b842057 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -258,8 +258,10 @@ Chat Embeddings API ^^^^^^^^^^^^^^^^^^^ vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, -where a list of ``messages`` can be passed instead of batched ``inputs``. The format of ``messages`` is the same as in Chat Completions API. -This enables multi-modal inputs to be passed to embedding models. +where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. + +.. note:: + The schema of ``messages`` is exactly the same as in Chat Completions API. In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. @@ -273,7 +275,7 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. Since ``VLM2Vec`` has the same model architecture as ``Phi-3.5-vision``, we have to explicitly pass ``--task embedding`` to run this model in embedding mode instead of text generation mode. -Since this schema is not defined by OpenAI client, we have to post a request to the server using the lower-level ``requests`` library: +Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: .. code-block:: python From a38784546b745f5210c1c37aabe523bead0aa487 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:30:36 +0000 Subject: [PATCH 33/43] Update --- docs/source/models/vlm.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 4dfac8b842057..7888d9eb0082f 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -272,7 +272,7 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. .. important:: - Since ``VLM2Vec`` has the same model architecture as ``Phi-3.5-vision``, we have to explicitly pass ``--task embedding`` + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` to run this model in embedding mode instead of text generation mode. 
Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: @@ -286,7 +286,7 @@ Since this schema is not defined by OpenAI client, we post a request to the serv response = requests.post( "http://localhost:8000/v1/embeddings", json={ - "model": model_name, + "model": "TIGER-Lab/VLM2Vec-Full", "messages": [{ "role": "user", "content": [ From d80ec7eb49996a83d00053dbbc62affa0638d846 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:31:15 +0000 Subject: [PATCH 34/43] Update --- docs/source/models/vlm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 7888d9eb0082f..1040e5a2f5957 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -294,7 +294,7 @@ Since this schema is not defined by OpenAI client, we post a request to the serv {"type": "text", "text": "Represent the given image."}, ], }], - "encoding_format": "float" + "encoding_format": "float", }, ) response.raise_for_status() From ea5fd9638f9884feceef421f0585fc81239b9b08 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:33:19 +0000 Subject: [PATCH 35/43] format --- vllm/entrypoints/openai/protocol.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8193982efbf47..9709efff66570 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -290,6 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " "through out the inference process and return in response.")) + # doc: end-chat-completion-extra-params def to_beam_search_params(self, @@ -587,6 +588,7 @@ class CompletionRequest(OpenAIBaseModel): "The priority of the request (lower means earlier handling; " "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) + # doc: end-completion-extra-params def to_beam_search_params(self, @@ -726,6 +728,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): "The priority of the request (lower means earlier handling; " "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) + # doc: end-embedding-extra-params def to_pooling_params(self): From b05ede60d248b13e0c89d20d54386497d4dd1e35 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:47:01 +0000 Subject: [PATCH 36/43] Convert to tip --- docs/source/models/vlm.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 1040e5a2f5957..342fa95ba4553 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -243,6 +243,9 @@ To consume the server, you can use the OpenAI client like in the example below: A full code example can be found in `examples/openai_api_client_for_multimodal.py `_. +.. tip:: + There is no need to format the prompt in the API request since it will be handled by the server. + .. note:: By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: @@ -251,16 +254,13 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p $ export VLLM_IMAGE_FETCH_TIMEOUT= -.. 
note:: - There is no need to format the prompt in the API request since it will be handled by the server. - Chat Embeddings API ^^^^^^^^^^^^^^^^^^^ vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. -.. note:: +.. tip:: The schema of ``messages`` is exactly the same as in Chat Completions API. In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. From dba9806cebd4f866eb437904e09b050f371812fe Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 29 Oct 2024 17:48:47 +0000 Subject: [PATCH 37/43] newline --- docs/source/serving/openai_compatible_server.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 44561f10ffe0f..7f54ea8d10864 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -63,6 +63,7 @@ completion = client.chat.completions.create( ``` ### Extra Parameters for Completions API + The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py From 557c9efb0952aac161da38ee77be6541381152d2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 30 Oct 2024 02:53:27 +0000 Subject: [PATCH 38/43] Fix missing client --- tests/entrypoints/openai/test_metrics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6f3808b6db142..b3f1fea91d13e 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -4,6 +4,7 @@ import time from http import HTTPStatus +import openai import pytest import pytest_asyncio import requests @@ -78,7 +79,8 @@ async def client(server): @pytest.mark.asyncio -async def test_metrics_counts(server: RemoteOpenAIServer): +async def test_metrics_counts(server: RemoteOpenAIServer, + client: openai.AsyncClient): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -167,7 +169,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer): @pytest.mark.asyncio -async def test_metrics_exist(server: RemoteOpenAIServer): +async def test_metrics_exist(server: RemoteOpenAIServer, + client: openai.AsyncClient): # sending a request triggers the metrics to be logged. 
await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", From 46f316f1c16473ad33848783fb68a82cf691ffbb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Nov 2024 04:14:39 +0000 Subject: [PATCH 39/43] Optionally initialize request handlers --- vllm/entrypoints/openai/api_server.py | 95 ++++++++++++-------- vllm/entrypoints/openai/run_batch.py | 33 +++++-- vllm/entrypoints/openai/serving_embedding.py | 15 ---- 3 files changed, 82 insertions(+), 61 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a51d030016fa2..95fd56d916050 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,7 +11,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Set +from typing import AsyncIterator, Optional, Set import uvloop from fastapi import APIRouter, FastAPI, Request @@ -51,7 +51,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -248,20 +248,25 @@ def mount_metrics(app: FastAPI): app.routes.append(metrics_route) -def chat(request: Request) -> OpenAIServingChat: +def base(request: Request) -> OpenAIServing: + # Reuse the existing instance + return tokenization(request) + + +def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat -def completion(request: Request) -> OpenAIServingCompletion: +def completion(request: Request) -> Optional[OpenAIServingCompletion]: return request.app.state.openai_serving_completion -def tokenization(request: Request) -> OpenAIServingTokenization: - return request.app.state.openai_serving_tokenization +def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: + return request.app.state.openai_serving_embedding -def embedding(request: Request) -> OpenAIServingEmbedding: - return request.app.state.openai_serving_embedding +def tokenization(request: Request) -> OpenAIServingTokenization: + return request.app.state.openai_serving_tokenization def engine_client(request: Request) -> EngineClient: @@ -277,7 +282,9 @@ async def health(raw_request: Request) -> Response: @router.post("/tokenize") async def tokenize(request: TokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_tokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_tokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -289,7 +296,9 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): @router.post("/detokenize") async def detokenize(request: DetokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_detokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_detokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -301,7 +310,9 @@ async def 
detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - models = await completion(raw_request).show_available_models() + handler = base(raw_request) + + models = await handler.show_available_models() return JSONResponse(content=models.model_dump()) @@ -314,9 +325,12 @@ async def show_version(): @router.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - generator = await chat(raw_request).create_chat_completion( - request, raw_request) + generator = await handler.create_chat_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -330,8 +344,12 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): - generator = await completion(raw_request).create_completion( - request, raw_request) + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") + + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -343,8 +361,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request): @router.post("/v1/embeddings") async def create_embedding(request: EmbeddingRequest, raw_request: Request): - generator = await embedding(raw_request).create_embedding( - request, raw_request) + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -382,30 +404,26 @@ async def stop_profile(raw_request: Request): @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - 
return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -501,7 +519,8 @@ def init_app_state( chat_template=args.chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, - tool_parser=args.tool_call_parser) + tool_parser=args.tool_call_parser, + ) if model_config.task == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -510,14 +529,14 @@ def init_app_state( prompt_adapters=args.prompt_adapters, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) + ) if model_config.task == "generate" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, base_model_paths, request_logger=request_logger, chat_template=args.chat_template, - ) + ) if model_config.task == "embedding" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 41b9d92f1166d..a64467a311523 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -217,14 +217,14 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, - ) + ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, base_model_paths, request_logger=request_logger, chat_template=None, - ) + ) if model_config.task == "embedding" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -241,14 +241,31 @@ async def main(args): # Determine the type of request and run it. 
if request.url == "/v1/chat/completions": - response_futures.append( - run_request(openai_serving_chat.create_chat_completion, - request, tracker)) + handler_fn = (None if openai_serving_chat is None else + openai_serving_chat.create_chat_completion) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg= + "The model does not support Chat Completions API", + )) + continue + + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() elif request.url == "/v1/embeddings": - response_futures.append( - run_request(openai_serving_embedding.create_embedding, request, - tracker)) + handler_fn = (None if openai_serving_embedding is None else + openai_serving_embedding.create_embedding) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Embeddings API", + )) + continue + + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 779fce3c04869..917856cd2b2dd 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -24,17 +24,6 @@ logger = init_logger(__name__) -def check_embedding_mode(model_config: ModelConfig) -> bool: - embedding_mode = model_config.task == "embedding" - - if not embedding_mode: - logger.warning("embedding_mode is False. Embedding API will not work.") - else: - logger.info("Activating the server engine with embedding enabled.") - - return embedding_mode - - def _get_embedding( output: EmbeddingOutput, encoding_format: Literal["float", "base64"], @@ -98,8 +87,6 @@ def __init__( self.chat_template = load_chat_template(chat_template) - self._enabled = check_embedding_mode(model_config) - async def create_embedding( self, request: EmbeddingRequest, @@ -111,8 +98,6 @@ async def create_embedding( See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. """ - if not self._enabled: - return self.create_error_response("Embedding API disabled") error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret From 1179f66a3a2e7ee73dc5a94cb5dfb7be72b57db1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Nov 2024 04:17:10 +0000 Subject: [PATCH 40/43] Update tip --- docs/source/models/vlm.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 342fa95ba4553..9290837595578 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -244,7 +244,8 @@ To consume the server, you can use the OpenAI client like in the example below: A full code example can be found in `examples/openai_api_client_for_multimodal.py `_. .. tip:: - There is no need to format the prompt in the API request since it will be handled by the server. + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. .. 
note:: From eb4b235911ea872c93feb74e55f566261f011c48 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Nov 2024 05:54:58 +0000 Subject: [PATCH 41/43] Update tests --- tests/entrypoints/openai/test_embedding.py | 38 +++++++++++++--------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 255bbf1194749..e2a883135c741 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -111,6 +111,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_conversation_embedding(server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, model_name: str): messages = [{ "role": "user", @@ -123,13 +124,14 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, "content": "Stars twinkle brightly in the night sky.", }] - response = requests.post(server.url_for("v1/embeddings"), - json={ - "model": model_name, - "messages": messages, - "encoding_format": "float" - }) - response.raise_for_status() + chat_response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }) + chat_response.raise_for_status() + chat_embeddings = chat_response.json() tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") prompt = tokenizer.apply_chat_template( @@ -139,15 +141,19 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, continue_final_message=False, tokenize=False, ) - tokens = tokenizer.encode(prompt, add_special_tokens=False) - - embeddings = response.json() - assert embeddings["id"] is not None - assert len(embeddings["data"]) == 1 - assert len(embeddings["data"][0]["embedding"]) == 4096 - assert embeddings["usage"]["completion_tokens"] == 0 - assert embeddings["usage"]["prompt_tokens"] == len(tokens) - assert embeddings["usage"]["total_tokens"] == len(tokens) + completion_response = await client.embeddings.create( + model=model_name, + input=prompt, + encoding_format="float", + # To be consistent with chat + extra_body={"add_special_tokens": False}, + ) + completion_embeddings = completion_response.model_dump(mode="json") + + assert chat_embeddings.pop("id") is not None + assert completion_embeddings.pop("id") is not None + assert chat_embeddings.pop("created") <= completion_embeddings.pop("created") + assert chat_embeddings == completion_embeddings @pytest.mark.asyncio From bf46a16125842eefbd3194d59ab2bb8ed83d783a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Nov 2024 05:55:38 +0000 Subject: [PATCH 42/43] format --- tests/entrypoints/openai/test_embedding.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index e2a883135c741..9f2b77dde2a7f 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -126,9 +126,9 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, chat_response = requests.post(server.url_for("v1/embeddings"), json={ - "model": model_name, - "messages": messages, - "encoding_format": "float", + "model": model_name, + "messages": messages, + "encoding_format": "float", }) chat_response.raise_for_status() chat_embeddings = chat_response.json() @@ -152,7 +152,8 @@ async def test_conversation_embedding(server: 
RemoteOpenAIServer, assert chat_embeddings.pop("id") is not None assert completion_embeddings.pop("id") is not None - assert chat_embeddings.pop("created") <= completion_embeddings.pop("created") + assert chat_embeddings.pop("created") <= completion_embeddings.pop( + "created") assert chat_embeddings == completion_embeddings From 7f188f9da7b4b4785a7c36577bd056ab63a3adf6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 1 Nov 2024 05:56:35 +0000 Subject: [PATCH 43/43] Rename --- docs/source/models/vlm.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 9290837595578..ac6405b9807a8 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -299,6 +299,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv }, ) response.raise_for_status() - - embedding_json = response.json() - print("Embedding output:", embedding_json["data"][0]["embedding"]) + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"])
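
A minimal client-side sketch of the two request styles exercised by the tests above: passing chat `messages` to `/v1/embeddings`, and decoding a `base64`-encoded embedding response. The base URL, model name, and launch configuration are assumptions for illustration only — it presumes an OpenAI-compatible vLLM server is already running an embedding-task model (with a chat template) at `http://localhost:8000`:

```python
import base64

import numpy as np
import requests
from openai import OpenAI

# Assumed endpoint and model -- substitute whatever your server was launched with.
BASE_URL = "http://localhost:8000"
MODEL = "intfloat/e5-mistral-7b-instruct"

# 1) Chat-style embeddings: send `messages` instead of `input`.
#    The OpenAI client has no `messages` field on its embeddings API,
#    so the request is posted directly, as in the docs and tests above.
messages = [
    {"role": "user", "content": "The cat sat on the mat."},
    {"role": "assistant", "content": "A feline was resting on a rug."},
    {"role": "user", "content": "Stars twinkle brightly in the night sky."},
]
resp = requests.post(
    f"{BASE_URL}/v1/embeddings",
    json={"model": MODEL, "messages": messages, "encoding_format": "float"},
)
resp.raise_for_status()
chat_embedding = resp.json()["data"][0]["embedding"]
print("chat-style embedding length:", len(chat_embedding))

# 2) Batched `input` with base64 encoding, decoded the same way the test does:
#    each embedding arrives as a base64 string of little-endian float32 values.
client = OpenAI(base_url=f"{BASE_URL}/v1", api_key="EMPTY")
out = client.embeddings.create(
    model=MODEL,
    input=[
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ],
    encoding_format="base64",
)
decoded = [
    np.frombuffer(base64.b64decode(item.embedding), dtype="float32").tolist()
    for item in out.data
]
print("decoded base64 embeddings:", len(decoded), "x", len(decoded[0]))
```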