diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 26b95ad05333..2b0654fa6d46 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs. ```python from vllm import LLM -llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", - tensor_parallel_size=2) +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` !!! warning @@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option). ```python from vllm import LLM -llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2) +llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) ``` ## Reduce CUDA Graphs @@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag: ```python from vllm import LLM -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True) +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) ``` ## Adjust cache size @@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem from vllm import LLM # Accept up to 3 images and 1 video per prompt -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 3, "video": 1}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}, +) ``` You can go a step further and disable unused modalities completely by setting its limit to zero. 
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a from vllm import LLM # Accept any number of images but no videos -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"video": 0}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}, +) ``` You can even run a multi-modal model for text-only inference: @@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference: from vllm import LLM # Don't accept images. Just text. -llm = LLM(model="google/gemma-3-27b-it", - limit_mm_per_prompt={"image": 0}) +llm = LLM( + model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}, +) ``` ### Configurable options @@ -173,14 +175,14 @@ Here are some examples: from vllm import LLM # Available for Qwen2-VL series models -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 +) # Available for InternVL series models -llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) +llm = LLM( + model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 +) ``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 5c74610ebd29..24c1efa61f28 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -100,7 +100,7 @@ from vllm import LLM llm = LLM( - model="meta-llama/Llama-3.3-70B-Instruct, + model="meta-llama/Llama-3.3-70B-Instruct", tensor_parallel_size=4, - pipeline_parallel_size=2 + pipeline_parallel_size=2, ) ``` @@ -257,18 +257,24 @@ Examples: ```python # Use a larger cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_cache_gb=8) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=8, +) # Use a 
shared-memory based IPC cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - tensor_parallel_size=2, - mm_processor_cache_type="shm", - mm_processor_cache_gb=8) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + tensor_parallel_size=2, + mm_processor_cache_type="shm", + mm_processor_cache_gb=8, +) # Disable the cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_cache_gb=0) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=0, +) ``` ### Cache Placement diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index aafdb1058e03..a423f4e68337 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -73,8 +73,8 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, ) -> torch.Tensor: ... ``` diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 724dc2284e28..721081dffb49 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -16,7 +16,7 @@ Further update the model as follows: ... @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): return "" @@ -45,14 +45,14 @@ Further update the model as follows: ... 
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: - assert self.vision_encoder is not None image_features = self.vision_encoder(image_input) return self.multi_modal_projector(image_features) def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - + self, + **kwargs: object, + ) -> MultiModalEmbeddings | None: # Validate the multimodal input keyword arguments image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: @@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m For example, if the model supports any number of images but only one video per prompt: ```python -def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: +def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None, "video": 1} ``` @@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, + mm_options: Mapping[str, BaseDummyOptions] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) @@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() - return ImageSize(width=image_processor.size["width"], - height=image_processor.size["height"]) + return ImageSize( + width=image_processor.size["width"], + height=image_processor.size["height"], + ) ``` Fuyu does not expect image placeholders in the inputs to HF processor, so @@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return { "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images, - overrides=image_overrides) + self._get_dummy_images( + 
width=target_width, + height=target_height, + num_images=num_images, + overrides=image_overrides, + ) } ``` @@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_width=image_size.width, image_height=image_size.height, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows return PromptUpdateDetails.select_token_id( image_tokens + [bos_token_id], @@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_width=image_size.width, image_height=image_size.height, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows return PromptUpdateDetails.select_token_id( image_tokens + [bos_token_id], @@ -810,9 +812,11 @@ to register them to the multi-modal registry: from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY -+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, -+ info=YourProcessingInfo, -+ dummy_inputs=YourDummyInputsBuilder) ++ @MULTIMODAL_REGISTRY.register_processor( ++ YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder, ++ ) class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index 35f35ffa4cde..3bb4f961ef15 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -42,7 +42,7 @@ def register(): ModelRegistry.register_model( "YourModelForCausalLM", - "your_code:YourModelForCausalLM" + "your_code:YourModelForCausalLM", ) ``` diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 4ce748ce1fed..59f14a5ea27b 100644 --- a/docs/contributing/model/transcription.md 
+++ b/docs/contributing/model/transcription.md @@ -15,6 +15,7 @@ Declare supported languages and capabilities: - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). ??? code "supported_languages and supports_transcription_only" + ```python from typing import ClassVar, Mapping, Literal import numpy as np @@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor This is for controlling general behavior of the API when serving your model: ??? code "get_speech_to_text_config()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: ??? code "get_generation_prompt()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: ??? code "get_generation_prompt()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface If your model requires a language and you want a default, override this method (see Whisper): ??? code "validate_language()" + ```python @classmethod def validate_language(cls, language: str | None) -> str | None: if language is None: logger.warning( - "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") + "Defaulting to language='en'. If you wish to transcribe " + "audio in a different language, pass the `language` field " + "in the TranscriptionRequest." 
+ ) language = "en" return super().validate_language(language) ``` @@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo Provide a fast duration→token estimate to improve streaming usage statistics: ??? code "get_num_audio_tokens()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi Relevant server logic: ??? code "_preprocess_speech_to_text()" + ```python # vllm/entrypoints/openai/speech_to_text.py async def _preprocess_speech_to_text(...): diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md index 1f233c3204a1..960347d9525c 100644 --- a/docs/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference ??? console "Command" - ```python + ```bash curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ -H 'Content-Type: application/json' \ -H 'Authorization: ' \ @@ -81,7 +81,7 @@ You should get a response like: ??? 
console "Response" - ```python + ```json { "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "result": { diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index fe4d87f78f2a..9d2c7f5bb565 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: client = OpenAI( base_url="https://gateway.", - api_key="" + api_key="", ) completion = client.chat.completions.create( @@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: "role": "user", "content": "Compose a poem that explains the concept of recursion in programming.", } - ] + ], ) print(completion.choices[0].message.content) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index 836305cf15c4..b53b829d6d3c 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -34,7 +34,7 @@ pip install vllm haystack-ai api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), model="mistralai/Mistral-7B-Instruct-v0.1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", - generation_kwargs = {"max_tokens": 512} + generation_kwargs={"max_tokens": 512}, ) response = generator.run( diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md index 75a234bdf142..d39bb9a899c8 100644 --- a/docs/deployment/frameworks/hf_inference_endpoints.md +++ b/docs/deployment/frameworks/hf_inference_endpoints.md @@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo import os client = OpenAI( - base_url = DEPLOYMENT_URL, - api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens + base_url=DEPLOYMENT_URL, + api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens ) chat_completion = 
client.chat.completions.create( - model = "HuggingFaceTB/SmolLM3-3B", - messages = [ + model="HuggingFaceTB/SmolLM3-3B", + messages=[ { "role": "user", "content": [ { "type": "text", - "text": "Give me a brief explanation of gravity in simple terms." + "text": "Give me a brief explanation of gravity in simple terms.", } - ] + ], } ], - stream = True + stream=True, ) for message in chat_completion: - print(message.choices[0].delta.content, end = "") + print(message.choices[0].delta.content, end="") ``` !!! note @@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg import os client = OpenAI( - base_url = DEPLOYMENT_URL, - api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens + base_url=DEPLOYMENT_URL, + api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens ) chat_completion = client.chat.completions.create( - model = "ibm-granite/granite-docling-258M", - messages = [ + model="ibm-granite/granite-docling-258M", + messages=[ { "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png" - } + "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png", + }, }, { "type": "text", - "text": "Convert this page to docling." - } + "text": "Convert this page to docling.", + }, ] } ], - stream = True + stream=True, ) for message in chat_completion: - print(message.choices[0].delta.content, end = "") + print(message.choices[0].delta.content, end="") ``` !!! 
note diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index 0d6c3729911a..9ea7c0373d2a 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -36,15 +36,16 @@ pip install vllm litellm ```python import litellm - messages = [{ "content": "Hello, how are you?","role": "user"}] + messages = [{"content": "Hello, how are you?", "role": "user"}] # hosted_vllm is prefix key word and necessary response = litellm.completion( - model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name - messages=messages, - api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", - temperature=0.2, - max_tokens=80) + model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name + messages=messages, + api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", + temperature=0.2, + max_tokens=80, + ) print(response) ``` diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index d86ab1600f12..37f90ef08f32 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -40,7 +40,7 @@ pip install -U vllm \ 1. Run the script - ```python + ```bash python retrieval_augmented_generation_with_langchain.py ``` @@ -78,6 +78,6 @@ pip install vllm \ 1. Run the script: - ```python + ```bash python retrieval_augmented_generation_with_llamaindex.py ``` diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index f88a29f6eadd..315746b0ef67 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -106,9 +106,11 @@ The dispatch code looks like: batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) 
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) # execution -with set_forward_context(..., - cudagraph_runtime_mode=runtime_mode, - batch_descriptor=batch_descriptor): +with set_forward_context( + ..., + cudagraph_runtime_mode=runtime_mode, + batch_descriptor=batch_descriptor, +): output = self.model(...) ``` @@ -202,10 +204,10 @@ from vllm.config import CUDAGraphMode compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - dtype='auto', - compilation_config = compilation_config, - ) + model="meta-llama/Llama-3.1-8B-Instruct", + dtype="auto", + compilation_config=compilation_config, +) sampling_params = vllm.SamplingParams( temperature=0, # greedy decoding max_tokens=1024, diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index e70ee4a076e5..682fc5c413e2 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin IO Processor plugins implement the `IOProcessor` interface (): ```python -IOProcessorInput = TypeVar('IOProcessorInput') -IOProcessorOutput = TypeVar('IOProcessorOutput') +IOProcessorInput = TypeVar("IOProcessorInput") +IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): def pre_process( self, prompt: IOProcessorInput, - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: + ) -> PromptType | Sequence[PromptType]: raise NotImplementedError async def pre_process_async( self, prompt: IOProcessorInput, - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: + ) -> PromptType | 
Sequence[PromptType]: return self.pre_process(prompt, request_id, **kwargs) @abstractmethod - def post_process(self, - model_output: Sequence[PoolingRequestOutput], - request_id: Optional[str] = None, - **kwargs) -> IOProcessorOutput: + def post_process( + self, + model_output: Sequence[PoolingRequestOutput], + request_id: str | None = None, + **kwargs, + ) -> IOProcessorOutput: raise NotImplementedError async def post_process_async( self, model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, ) -> IOProcessorOutput: collected_output = [item async for i, item in model_output] @@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @abstractmethod def output_to_response( - self, plugin_output: IOProcessorOutput) -> IOProcessorResponse: + self, plugin_output: IOProcessorOutput + ) -> IOProcessorResponse: raise NotImplementedError ``` diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 90b2fd32f297..c4a2d72a2f4a 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -478,15 +478,17 @@ us with: ```python if seq_group.is_finished(): - if (seq_group.metrics.first_scheduled_time is not None and - seq_group.metrics.first_token_time is not None): + if ( + seq_group.metrics.first_scheduled_time is not None + and seq_group.metrics.first_token_time is not None + ): time_queue_requests.append( seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) + seq_group.metrics.arrival_time + ) ... if seq_group.metrics.time_in_queue is not None: - time_in_queue_requests.append( - seq_group.metrics.time_in_queue) + time_in_queue_requests.append(seq_group.metrics.time_in_queue) ``` This seems duplicative, and one of them should be removed. 
The latter diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index 9941837bf165..270699df623e 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -112,8 +112,8 @@ class KVCacheBlock: ref_cnt: int # The pointers to form a doubly linked list for the free queue. - prev_free_block: Optional["KVCacheBlock"] = None - next_free_block: Optional["KVCacheBlock"] = None + prev_free_block: "KVCacheBlock | None" = None + next_free_block: "KVCacheBlock | None" = None ``` There are two design points to highlight: diff --git a/docs/features/lora.md b/docs/features/lora.md index db794b2ebd71..d3b44520a5a7 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter. sampling_params = SamplingParams( temperature=0, max_tokens=256, - stop=["[/assistant]"] + stop=["[/assistant]"], ) prompts = [ @@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter. outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path), ) ``` @@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin: lora_request = LoRARequest( lora_name=lora_name, lora_path=local_path, - lora_int_id=abs(hash(lora_name)) + lora_int_id=abs(hash(lora_name)), ) return lora_request ``` @@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au if has_audio: question = f"<|audio|>{question}" chat = [ - { - "role": "user", - "content": question - } + {"role": "user", "content": question}, ] return tokenizer.apply_chat_template(chat, tokenize=False) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index dcc5ea3b9096..8f75f714d4b0 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -154,9 +154,7 @@ To substitute multiple images inside the 
same text prompt, you can pass in a lis outputs = llm.generate({ "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, + "multi_modal_data": {"image": [image1, image2]}, }) for o in outputs: @@ -183,21 +181,24 @@ conversation = [ {"role": "assistant", "content": "Hello! How can I assist you today?"}, { "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - },{ - "type": "image_pil", - "image_pil": image_pil - }, { - "type": "image_embeds", - "image_embeds": image_embeds - }, { - "type": "text", - "text": "What's in these images?" - }], + "content": [ + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "image_pil", + "image_pil": image_pil, + }, + { + "type": "image_embeds", + "image_embeds": image_embeds, + }, + { + "type": "text", + "text": "What's in these images?", + }, + ], }, ] @@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with message = { "role": "user", "content": [ - {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + { + "type": "text", + "text": "Describe this set of frames. 
Consider the frames to be a part of the same video.", + }, ], } for i in range(len(video_frames)): @@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f # Custom black background for dark theme llm = LLM( model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}, ) # Custom brand color background (e.g., blue) llm = LLM( model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}, ) ``` @@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown limit_mm_per_prompt={"video": 1}, ) - sampling_params = SamplingParams( - max_tokens=1024, - ) + sampling_params = SamplingParams(max_tokens=1024) video_messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + { + "role": "user", + "content": [ {"type": "text", "text": "describe this video."}, { "type": "video", "video": video_path, "total_pixels": 20480 * 28 * 28, - "min_pixels": 16 * 28 * 28 - } + "min_pixels": 16 * 28 * 28, + }, ] }, ] @@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows: chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - { - "type": "image_url", - "image_url": { - url": image_url + messages=[ + { + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. 
+ { + "type": "text", + "text": "What’s in this image?", }, - "uuid": image_url # Optional - }, - ], - }], + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_url, # Optional + }, + ], + } + ], ) print("Chat completion output:", chat_response.choices[0].message.content) @@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows: chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - { - "type": "image_url", - "image_url": { - "url": image_url_duck + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?", }, - "uuid": image_url_duck # Optional - }, - { - "type": "image_url", - "image_url": { - "url": image_url_lion + { + "type": "image_url", + "image_url": {"url": image_url_duck}, + "uuid": image_url_duck, # Optional }, - "uuid": image_url_lion # Optional - }, - ], - }], + { + "type": "image_url", + "image_url": {"url": image_url_lion}, + "uuid": image_url_lion, # Optional + }, + ], + } + ], ) print("Chat completion output:", chat_response.choices[0].message.content) ``` @@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows: ## Use video url in the payload chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" 
- }, - { - "type": "video_url", - "video_url": { - "url": video_url + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this video?", }, - "uuid": video_url # Optional - }, - ], - }], + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_url, # Optional + }, + ], + } + ], model=model, max_completion_tokens=64, ) @@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows: audio_base64 = encode_base64_content_from_url(audio_url) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "input_audio", - "input_audio": { - "data": audio_base64, - "format": "wav" + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?", }, - "uuid": audio_url # Optional - }, - ], - }], + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav", + }, + "uuid": audio_url, # Optional + }, + ], + }, + ], model=model, max_completion_tokens=64, ) @@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag ```python chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" 
- }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?", }, - "uuid": audio_url # Optional - }, - ], - }], + { + "type": "audio_url", + "audio_url": {"url": audio_url}, + "uuid": audio_url, # Optional + }, + ], + } + ], model=model, max_completion_tokens=64, ) @@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se # Basic usage - this is equivalent to the LLaVA example for offline inference model = "llava-hf/llava-1.5-7b-hf" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": f"{base64_image_embedding}", - "uuid": image_url # Optional + "uuid": image_url, # Optional } # Pass additional parameters (available to Qwen2-VL and MiniCPM-V) model = "Qwen/Qwen2-VL-2B-Instruct" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct + "image_embeds": f"{base64_image_embedding}", # Required + "image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct }, - "uuid": image_url # Optional + "uuid": image_url, # Optional } model = "openbmb/MiniCPM-V-2_6" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 + "image_embeds": f"{base64_image_embedding}", # Required + "image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6 }, - "uuid": image_url # Optional + "uuid": image_url, # Optional } chat_completion = client.chat.completions.create( messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ { - "type": "text", - "text": "What's in this image?", + "role": "system", + "content": "You are a helpful 
assistant.", }, - embeds, - ], - }, - ], + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?", + }, + embeds, + ], + }, + ], model=model, ) ``` @@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit { "type": "image_embeds", "image_embeds": None, - "uuid": image_uuid + "uuid": image_uuid, }, # input_audio: { "type": "input_audio", "input_audio": None, - "uuid": audio_uuid + "uuid": audio_uuid, }, # PIL Image: { "type": "image_pil", - "image_pil": None - "uuid": image_uuid - } + "image_pil": None, + "uuid": image_uuid, + }, ``` diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 389b3cb21ef5..ab04a1efcc08 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For Qwen3 series, if you want to disable thinking in reasoning mode, add: # extra_body={"chat_template_kwargs": {"enable_thinking": False}} - stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) + stream = client.chat.completions.create( + model=model, + messages=messages, + stream=True, + ) print("client: Start streaming chat completions...") printed_reasoning_content = False @@ -159,27 +161,29 @@ The reasoning content is also available when both tool calling and the reasoning client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} - }, - "required": ["location", "unit"] - } + 
tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location", "unit"], + } + }, } - }] + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools, - tool_choice="auto" + tool_choice="auto", ) print(response) @@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """ Instance method that should be implemented for extracting reasoning from an incomplete response; for use when handling reasoning calls and @@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to tuple[Optional[str], Optional[str]]: + self, + model_output: str, + request: ChatCompletionRequest | ResponsesRequest, + ) -> tuple[str | None, str | None]: """ Extract reasoning content from a complete model-generated string. 
@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner @classmethod def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: - return cls(start_token_id=tokenizer.encode( - "<think>", add_special_tokens=False)[0], - end_token_id=tokenizer.encode("</think>", - add_special_tokens=False)[0]) + return cls( + start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0], + end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0], + ) def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.end_token_id in input_ids diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index e57a8945971f..02a700c09d39 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools: return f"Getting the weather for {location} in {unit}..." tool_functions = {"get_weather": get_weather} - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["location", "unit"], }, - "required": ["location", "unit"] - } - } - }] + }, + }, + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools, - tool_choice="auto" + 
tool_choice="auto", ) tool_call = response.choices[0].message.tool_calls[0].function @@ -402,8 +404,7 @@ Here is a summary of a plugin file: # adjust request. e.g.: set skip special tokens # to False for tool call output. - def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: return request # implement the tool call parse for stream call @@ -416,7 +417,7 @@ Here is a summary of a plugin file: current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: return delta # implement the tool parse for non-stream call