Commit ef9676a

[Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 70b1b33 commit ef9676a

20 files changed: +341 additions, -290 deletions
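
The diffs below apply ruff's formatter to the Python snippets embedded in these docs: short calls are joined onto one line, and exploded calls get one argument per line with trailing commas. For readers who want to check a snippet against the same style locally, here is a minimal sketch; it assumes `ruff` is installed on PATH and writes to a throwaway temp file rather than the real docs paths, so it is illustrative only and not part of this commit.

```python
# Illustrative only: check a snippet against ruff's formatting.
# Assumes `ruff` is installed and available on PATH (not part of this commit).
import subprocess
import tempfile

snippet = '''llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 3, "video": 1},
)
'''

with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
    f.write(snippet)
    path = f.name

# `ruff format --diff` prints a diff (and exits non-zero) if the file would be reformatted.
result = subprocess.run(["ruff", "format", "--diff", path], capture_output=True, text=True)
print(result.stdout or "snippet already matches ruff's formatting")
```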

docs/configuration/conserving_memory.md

Lines changed: 23 additions & 21 deletions
@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
 ```python
 from vllm import LLM
 
-llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
-          tensor_parallel_size=2)
+llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
 ```
 
 !!! warning
@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
 ```python
 from vllm import LLM
 
-llm = LLM(model="adept/fuyu-8b",
-          max_model_len=2048,
-          max_num_seqs=2)
+llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
 ```
 
 ## Reduce CUDA Graphs
@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
 ```python
 from vllm import LLM
 
-llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
-          enforce_eager=True)
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
 ```
 
 ## Adjust cache size
@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
 from vllm import LLM
 
 # Accept up to 3 images and 1 video per prompt
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"image": 3, "video": 1})
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"image": 3, "video": 1},
+)
 ```
 
 You can go a step further and disable unused modalities completely by setting its limit to zero.
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
 from vllm import LLM
 
 # Accept any number of images but no videos
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"video": 0})
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"video": 0},
+)
 ```
 
 You can even run a multi-modal model for text-only inference:
@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
 from vllm import LLM
 
 # Don't accept images. Just text.
-llm = LLM(model="google/gemma-3-27b-it",
-          limit_mm_per_prompt={"image": 0})
+llm = LLM(
+    model="google/gemma-3-27b-it",
+    limit_mm_per_prompt={"image": 0},
+)
 ```
 
 ### Configurable options
@@ -173,14 +175,14 @@ Here are some examples:
 from vllm import LLM
 
 # Available for Qwen2-VL series models
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_kwargs={
-              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
-          })
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_kwargs={"max_pixels": 768 * 768},  # Default is 1280 * 28 * 28
+)
 
 # Available for InternVL series models
-llm = LLM(model="OpenGVLab/InternVL2-2B",
-          mm_processor_kwargs={
-              "max_dynamic_patch": 4,  # Default is 12
-          })
+llm = LLM(
+    model="OpenGVLab/InternVL2-2B",
+    mm_processor_kwargs={"max_dynamic_patch": 4},  # Default is 12
+)
 ```
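
For reference, the options touched in this file can be combined in one constructor call, written in the trailing-comma style the commit adopts; the model name and values below are illustrative placeholders rather than recommendations from the docs.

```python
from vllm import LLM

# Illustrative combination of the memory-conserving options shown above
# (model and values are placeholders, not tuned recommendations).
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    max_model_len=4096,
    max_num_seqs=4,
    enforce_eager=True,
    limit_mm_per_prompt={"image": 3, "video": 0},
)
```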

docs/configuration/optimization.md

Lines changed: 15 additions & 9 deletions
@@ -100,7 +100,7 @@ from vllm import LLM
 llm = LLM(
     model="meta-llama/Llama-3.3-70B-Instruct",
     tensor_parallel_size=4,
-    pipeline_parallel_size=2
+    pipeline_parallel_size=2,
 )
 ```
 
@@ -257,18 +257,24 @@ Examples:
 
 ```python
 # Use a larger cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_cache_gb=8)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_cache_gb=8,
+)
 
 # Use a shared-memory based IPC cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          tensor_parallel_size=2,
-          mm_processor_cache_type="shm",
-          mm_processor_cache_gb=8)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    tensor_parallel_size=2,
+    mm_processor_cache_type="shm",
+    mm_processor_cache_gb=8,
+)
 
 # Disable the cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_cache_gb=0)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_cache_gb=0,
+)
 ```
 
 ### Cache Placement

docs/contributing/model/basic.md

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ def forward(
     self,
     input_ids: torch.Tensor,
     positions: torch.Tensor,
-    intermediate_tensors: Optional[IntermediateTensors] = None,
-    inputs_embeds: Optional[torch.Tensor] = None,
+    intermediate_tensors: IntermediateTensors | None = None,
+    inputs_embeds: torch.Tensor | None = None,
 ) -> torch.Tensor:
     ...
 ```
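
The hunk above swaps `typing.Optional[...]` annotations for PEP 604 unions (`X | None`). A tiny check of the equivalence, assuming CPython 3.10+ (older interpreters need `from __future__ import annotations` to use the new spelling inside annotations):

```python
from typing import Optional

# On Python 3.10+, the two spellings describe the same type and compare equal.
assert Optional[int] == (int | None)
```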

docs/contributing/model/multimodal.md

Lines changed: 23 additions & 19 deletions
@@ -16,7 +16,7 @@ Further update the model as follows:
     ...
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -45,14 +45,14 @@ Further update the model as follows:
         ...
 
     def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-
         assert self.vision_encoder is not None
         image_features = self.vision_encoder(image_input)
         return self.multi_modal_projector(image_features)
 
     def get_multimodal_embeddings(
-            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+        self,
+        **kwargs: object,
+    ) -> MultiModalEmbeddings | None:
         # Validate the multimodal input keyword arguments
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
 For example, if the model supports any number of images but only one video per prompt:
 
 ```python
-def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+def get_supported_mm_limits(self) -> Mapping[str, int | None]:
     return {"image": None, "video": 1}
 ```
 
@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
     self,
     seq_len: int,
     mm_counts: Mapping[str, int],
-    mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+    mm_options: Mapping[str, BaseDummyOptions] | None = None,
 ) -> MultiModalDataDict:
     num_images = mm_counts.get("image", 0)
 
@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
 ```python
 def get_image_size_with_most_features(self) -> ImageSize:
     image_processor = self.get_image_processor()
-    return ImageSize(width=image_processor.size["width"],
-                     height=image_processor.size["height"])
+    return ImageSize(
+        width=image_processor.size["width"],
+        height=image_processor.size["height"],
+    )
 ```
 
 Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
 
         return {
             "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images,
-                                   overrides=image_overrides)
+            self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
         }
 ```
@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
             image_width=image_size.width,
             image_height=image_size.height,
         )
-        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                        [_NEWLINE_TOKEN_ID]) * nrows
+        image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
         return PromptUpdateDetails.select_token_id(
             image_tokens + [bos_token_id],
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
             image_width=image_size.width,
             image_height=image_size.height,
         )
-        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                        [_NEWLINE_TOKEN_ID]) * nrows
+        image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
         return PromptUpdateDetails.select_token_id(
             image_tokens + [bos_token_id],
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
   from vllm.model_executor.models.interfaces import SupportsMultiModal
 + from vllm.multimodal import MULTIMODAL_REGISTRY
 
-+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
-+                                         info=YourProcessingInfo,
-+                                         dummy_inputs=YourDummyInputsBuilder)
++ @MULTIMODAL_REGISTRY.register_processor(
++     YourMultiModalProcessor,
++     info=YourProcessingInfo,
++     dummy_inputs=YourDummyInputsBuilder,
++ )
   class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```

docs/contributing/model/registration.md

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def register():
 
     ModelRegistry.register_model(
         "YourModelForCausalLM",
-        "your_code:YourModelForCausalLM"
+        "your_code:YourModelForCausalLM",
     )
 ```

docs/contributing/model/transcription.md

Lines changed: 11 additions & 1 deletion
@@ -15,6 +15,7 @@ Declare supported languages and capabilities:
 - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
 
 ??? code "supported_languages and supports_transcription_only"
+
     ```python
     from typing import ClassVar, Mapping, Literal
     import numpy as np
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
 This is for controlling general behavior of the API when serving your model:
 
 ??? code "get_speech_to_text_config()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
 Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
 
 ??? code "get_generation_prompt()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 
 ??? code "get_generation_prompt()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
 If your model requires a language and you want a default, override this method (see Whisper):
 
 ??? code "validate_language()"
+
     ```python
     @classmethod
     def validate_language(cls, language: str | None) -> str | None:
         if language is None:
             logger.warning(
-                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
+                "Defaulting to language='en'. If you wish to transcribe "
+                "audio in a different language, pass the `language` field "
+                "in the TranscriptionRequest."
+            )
             language = "en"
         return super().validate_language(language)
     ```
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
 Provide a fast duration→token estimate to improve streaming usage statistics:
 
 ??? code "get_num_audio_tokens()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
 Relevant server logic:
 
 ??? code "_preprocess_speech_to_text()"
+
     ```python
     # vllm/entrypoints/openai/speech_to_text.py
     async def _preprocess_speech_to_text(...):
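
As a usage note for the reworded warning above, this is roughly how a client supplies the `language` field; the sketch assumes a vLLM server exposing the OpenAI-compatible transcription endpoint, and the URL, audio file, and model name are placeholders.

```python
from openai import OpenAI

# Placeholders: adjust the base URL, model name, and audio file for your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio:
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",
        file=audio,
        language="en",  # omit this to hit the default-language warning shown above
    )

print(transcription.text)
```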

docs/deployment/frameworks/cerebrium.md

Lines changed: 2 additions & 2 deletions
@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
 
 ??? console "Command"
 
-    ```python
+    ```bash
     curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
        -H 'Content-Type: application/json' \
        -H 'Authorization: <JWT TOKEN>' \
@@ -81,7 +81,7 @@ You should get a response like:
 
 ??? console "Response"
 
-    ```python
+    ```json
     {
         "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
         "result": {

docs/deployment/frameworks/dstack.md

Lines changed: 2 additions & 2 deletions
@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
 
 client = OpenAI(
     base_url="https://gateway.<gateway domain>",
-    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
+    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
 )
 
 completion = client.chat.completions.create(
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
             "role": "user",
             "content": "Compose a poem that explains the concept of recursion in programming.",
         }
-    ]
+    ],
 )
 
 print(completion.choices[0].message.content)
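
Assembled from the two hunks above, a complete version of the client snippet; the gateway URL and token placeholders come from the surrounding guide, and the model name is an assumption standing in for whatever model the dstack service deploys.

```python
from openai import OpenAI

client = OpenAI(
    base_url="https://gateway.<gateway domain>",
    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
)

completion = client.chat.completions.create(
    model="<model served by your dstack run>",  # placeholder, not from the diff
    messages=[
        {
            "role": "user",
            "content": "Compose a poem that explains the concept of recursion in programming.",
        }
    ],
)

print(completion.choices[0].message.content)
```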

docs/deployment/frameworks/haystack.md

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ pip install vllm haystack-ai
     api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
     model="mistralai/Mistral-7B-Instruct-v0.1",
     api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
-    generation_kwargs = {"max_tokens": 512}
+    generation_kwargs={"max_tokens": 512},
 )
 
 response = generator.run(
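
For completeness, a runnable version of the generator configuration the hunk above touches; the host/port placeholders follow the guide, while the prompt text and response handling are illustrative assumptions rather than part of the original example.

```python
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret

generator = OpenAIGenerator(
    api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
    generation_kwargs={"max_tokens": 512},
)

# Illustrative call: `run` returns a dict whose "replies" list holds the generated text.
response = generator.run(prompt="Briefly explain what vLLM is.")
print(response["replies"][0])
```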
