diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
index 26b95ad05333..2b0654fa6d46 100644
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
 ```python
 from vllm import LLM
 
-llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
-          tensor_parallel_size=2)
+llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
 ```
 
 !!! warning
@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
 ```python
 from vllm import LLM
 
-llm = LLM(model="adept/fuyu-8b",
-          max_model_len=2048,
-          max_num_seqs=2)
+llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
 ```
 
 ## Reduce CUDA Graphs
@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
 ```python
 from vllm import LLM
 
-llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
-          enforce_eager=True)
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
 ```
 
 ## Adjust cache size
@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
 from vllm import LLM
 
 # Accept up to 3 images and 1 video per prompt
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"image": 3, "video": 1})
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"image": 3, "video": 1},
+)
 ```
 
 You can go a step further and disable unused modalities completely by setting its limit to zero.
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
 from vllm import LLM
 
 # Accept any number of images but no videos
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"video": 0})
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    limit_mm_per_prompt={"video": 0},
+)
 ```
 
 You can even run a multi-modal model for text-only inference:
@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
 from vllm import LLM
 
 # Don't accept images. Just text.
-llm = LLM(model="google/gemma-3-27b-it",
-          limit_mm_per_prompt={"image": 0})
+llm = LLM(
+    model="google/gemma-3-27b-it",
+    limit_mm_per_prompt={"image": 0},
+)
 ```
 
 ### Configurable options
@@ -173,14 +175,14 @@ Here are some examples:
 from vllm import LLM
 
 # Available for Qwen2-VL series models
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_kwargs={
-              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
-          })
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_kwargs={"max_pixels": 768 * 768},  # Default is 1280 * 28 * 28
+)
 
 # Available for InternVL series models
-llm = LLM(model="OpenGVLab/InternVL2-2B",
-          mm_processor_kwargs={
-              "max_dynamic_patch": 4,  # Default is 12
-          })
+llm = LLM(
+    model="OpenGVLab/InternVL2-2B",
+    mm_processor_kwargs={"max_dynamic_patch": 4},  # Default is 12
+)
 ```
diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index 5c74610ebd29..24c1efa61f28 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -100,7 +100,7 @@ from vllm import LLM
 llm = LLM(
     model="meta-llama/Llama-3.3-70B-Instruct,
     tensor_parallel_size=4,
-    pipeline_parallel_size=2
+    pipeline_parallel_size=2,
 )
 ```
 
@@ -257,18 +257,24 @@ Examples:
 
 ```python
 # Use a larger cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_cache_gb=8)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_cache_gb=8,
+)
 
 # Use a shared-memory based IPC cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          tensor_parallel_size=2,
-          mm_processor_cache_type="shm",
-          mm_processor_cache_gb=8)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    tensor_parallel_size=2,
+    mm_processor_cache_type="shm",
+    mm_processor_cache_gb=8,
+)
 
 # Disable the cache
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_cache_gb=0)
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    mm_processor_cache_gb=0,
+)
 ```
 
 ### Cache Placement
diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index aafdb1058e03..a423f4e68337 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -73,8 +73,8 @@ def forward(
     self,
     input_ids: torch.Tensor,
     positions: torch.Tensor,
-    intermediate_tensors: Optional[IntermediateTensors] = None,
-    inputs_embeds: Optional[torch.Tensor] = None,
+    intermediate_tensors: IntermediateTensors | None = None,
+    inputs_embeds: torch.Tensor | None = None,
 ) -> torch.Tensor:
     ...
 ```
diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md
index 724dc2284e28..721081dffb49 100644
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@@ -16,7 +16,7 @@ Further update the model as follows:
             ...
 
             @classmethod
-            def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+            def get_placeholder_str(cls, modality: str, i: int) -> str | None:
                 if modality.startswith("image"):
                     return "<image>"
 
@@ -45,14 +45,14 @@ Further update the model as follows:
             ...
 
             def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-
                 assert self.vision_encoder is not None
                 image_features = self.vision_encoder(image_input)
                 return self.multi_modal_projector(image_features)
 
             def get_multimodal_embeddings(
-                    self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
+                self,
+                **kwargs: object,
+            ) -> MultiModalEmbeddings | None:
                 # Validate the multimodal input keyword arguments
                 image_input = self._parse_and_validate_image_input(**kwargs)
                 if image_input is None:
@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
 For example, if the model supports any number of images but only one video per prompt:
 
 ```python
-def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+def get_supported_mm_limits(self) -> Mapping[str, int | None]:
     return {"image": None, "video": 1}
 ```
 
@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
             self,
             seq_len: int,
             mm_counts: Mapping[str, int],
-            mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+            mm_options: Mapping[str, BaseDummyOptions] | None = None,
         ) -> MultiModalDataDict:
             num_images = mm_counts.get("image", 0)
 
@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
     ```python
     def get_image_size_with_most_features(self) -> ImageSize:
         image_processor = self.get_image_processor()
-        return ImageSize(width=image_processor.size["width"],
-                            height=image_processor.size["height"])
+        return ImageSize(
+            width=image_processor.size["width"],
+            height=image_processor.size["height"],
+        )
     ```
 
     Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
 
             return {
                 "image":
-                self._get_dummy_images(width=target_width,
-                                    height=target_height,
-                                    num_images=num_images,
-                                    overrides=image_overrides)
+                self._get_dummy_images(
+                    width=target_width,
+                    height=target_height,
+                    num_images=num_images,
+                    overrides=image_overrides,
+                )
             }
         ```
 
@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                 image_width=image_size.width,
                 image_height=image_size.height,
             )
-            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                            [_NEWLINE_TOKEN_ID]) * nrows
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
             return PromptUpdateDetails.select_token_id(
                 image_tokens + [bos_token_id],
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
                     image_width=image_size.width,
                     image_height=image_size.height,
                 )
-                image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                                [_NEWLINE_TOKEN_ID]) * nrows
+                image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
 
                 return PromptUpdateDetails.select_token_id(
                     image_tokens + [bos_token_id],
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
   from vllm.model_executor.models.interfaces import SupportsMultiModal
 + from vllm.multimodal import MULTIMODAL_REGISTRY
 
-+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
-+                                         info=YourProcessingInfo,
-+                                         dummy_inputs=YourDummyInputsBuilder)
++ @MULTIMODAL_REGISTRY.register_processor(
++     YourMultiModalProcessor,
++     info=YourProcessingInfo,
++     dummy_inputs=YourDummyInputsBuilder,
++ )
   class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
 ```
 
diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md
index 35f35ffa4cde..3bb4f961ef15 100644
--- a/docs/contributing/model/registration.md
+++ b/docs/contributing/model/registration.md
@@ -42,7 +42,7 @@ def register():
 
     ModelRegistry.register_model(
         "YourModelForCausalLM",
-        "your_code:YourModelForCausalLM"
+        "your_code:YourModelForCausalLM",
     )
 ```
 
diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index 4ce748ce1fed..59f14a5ea27b 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -15,6 +15,7 @@ Declare supported languages and capabilities:
 - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
 
 ??? code "supported_languages and supports_transcription_only"
+
     ```python
     from typing import ClassVar, Mapping, Literal
     import numpy as np
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
 This is for controlling general behavior of the API when serving your model:
 
 ??? code "get_speech_to_text_config()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
 Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
 
 ??? code "get_generation_prompt()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 
 ??? code "get_generation_prompt()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
 If your model requires a language and you want a default, override this method (see Whisper):
 
 ??? code "validate_language()"
+
     ```python
     @classmethod
     def validate_language(cls, language: str | None) -> str | None:
         if language is None:
             logger.warning(
-                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
+                "Defaulting to language='en'. If you wish to transcribe "
+                "audio in a different language, pass the `language` field "
+                "in the TranscriptionRequest."
+            )
             language = "en"
         return super().validate_language(language)
     ```
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
 Provide a fast duration→token estimate to improve streaming usage statistics:
 
 ??? code "get_num_audio_tokens()"
+
     ```python
     class YourASRModel(nn.Module, SupportsTranscription):
         ...
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
 Relevant server logic:
 
 ??? code "_preprocess_speech_to_text()"
+
     ```python
     # vllm/entrypoints/openai/speech_to_text.py
     async def _preprocess_speech_to_text(...):
diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md
index 1f233c3204a1..960347d9525c 100644
--- a/docs/deployment/frameworks/cerebrium.md
+++ b/docs/deployment/frameworks/cerebrium.md
@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
 
 ??? console "Command"
 
-    ```python
+    ```bash
     curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
     -H 'Content-Type: application/json' \
     -H 'Authorization: <JWT TOKEN>' \
@@ -81,7 +81,7 @@ You should get a response like:
 
 ??? console "Response"
 
-    ```python
+    ```json
     {
         "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
         "result": {
diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md
index fe4d87f78f2a..9d2c7f5bb565 100644
--- a/docs/deployment/frameworks/dstack.md
+++ b/docs/deployment/frameworks/dstack.md
@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
 
     client = OpenAI(
         base_url="https://gateway.<gateway domain>",
-        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
+        api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
     )
 
     completion = client.chat.completions.create(
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
                 "role": "user",
                 "content": "Compose a poem that explains the concept of recursion in programming.",
             }
-        ]
+        ],
     )
 
     print(completion.choices[0].message.content)
diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md
index 836305cf15c4..b53b829d6d3c 100644
--- a/docs/deployment/frameworks/haystack.md
+++ b/docs/deployment/frameworks/haystack.md
@@ -34,7 +34,7 @@ pip install vllm haystack-ai
         api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
         model="mistralai/Mistral-7B-Instruct-v0.1",
         api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
-        generation_kwargs = {"max_tokens": 512}
+        generation_kwargs={"max_tokens": 512},
     )
 
     response = generator.run(
diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md
index 75a234bdf142..d39bb9a899c8 100644
--- a/docs/deployment/frameworks/hf_inference_endpoints.md
+++ b/docs/deployment/frameworks/hf_inference_endpoints.md
@@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
     import os
 
     client = OpenAI(
-        base_url = DEPLOYMENT_URL,
-        api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
+        base_url=DEPLOYMENT_URL,
+        api_key=os.environ["HF_TOKEN"],  # https://huggingface.co/settings/tokens
     )
 
     chat_completion = client.chat.completions.create(
-        model = "HuggingFaceTB/SmolLM3-3B",
-        messages = [
+        model="HuggingFaceTB/SmolLM3-3B",
+        messages=[
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "text",
-                        "text": "Give me a brief explanation of gravity in simple terms."
+                        "text": "Give me a brief explanation of gravity in simple terms.",
                     }
-                ]
+                ],
             }
         ],
-        stream = True
+        stream=True,
     )
 
     for message in chat_completion:
-        print(message.choices[0].delta.content, end = "")
+        print(message.choices[0].delta.content, end="")
     ```
 
 !!! note
@@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
     import os
 
     client = OpenAI(
-        base_url = DEPLOYMENT_URL,
-        api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
+        base_url=DEPLOYMENT_URL,
+        api_key=os.environ["HF_TOKEN"],  # https://huggingface.co/settings/tokens
     )
 
     chat_completion = client.chat.completions.create(
-        model = "ibm-granite/granite-docling-258M",
-        messages = [
+        model="ibm-granite/granite-docling-258M",
+        messages=[
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
-                        }
+                            "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
+                        },
                     },
                     {
                         "type": "text",
-                        "text": "Convert this page to docling."
-                    }
+                        "text": "Convert this page to docling.",
+                    },
                 ]
             }
         ],
-        stream = True
+        stream=True,
     )
 
     for message in chat_completion:
-        print(message.choices[0].delta.content, end = "")
+        print(message.choices[0].delta.content, end="")
     ```
 
 !!! note
diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md
index 0d6c3729911a..9ea7c0373d2a 100644
--- a/docs/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@@ -36,15 +36,16 @@ pip install vllm litellm
     ```python
     import litellm 
 
-    messages = [{ "content": "Hello, how are you?","role": "user"}]
+    messages = [{"content": "Hello, how are you?", "role": "user"}]
 
     # hosted_vllm is prefix key word and necessary
     response = litellm.completion(
-                model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
-                messages=messages,
-                api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
-                temperature=0.2,
-                max_tokens=80)
+        model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
+        messages=messages,
+        api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
+        temperature=0.2,
+        max_tokens=80,
+    )
 
     print(response)
     ```
diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md
index d86ab1600f12..37f90ef08f32 100644
--- a/docs/deployment/frameworks/retrieval_augmented_generation.md
+++ b/docs/deployment/frameworks/retrieval_augmented_generation.md
@@ -40,7 +40,7 @@ pip install -U vllm \
 
 1. Run the script
 
-    ```python
+    ```bash
     python retrieval_augmented_generation_with_langchain.py
     ```
 
@@ -78,6 +78,6 @@ pip install vllm \
 
 1. Run the script:
 
-    ```python
+    ```bash
     python retrieval_augmented_generation_with_llamaindex.py
     ```
diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md
index f88a29f6eadd..315746b0ef67 100644
--- a/docs/design/cuda_graphs.md
+++ b/docs/design/cuda_graphs.md
@@ -106,9 +106,11 @@ The dispatch code looks like:
 batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
 runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
 # execution
-with set_forward_context(..., 
-            cudagraph_runtime_mode=runtime_mode, 
-            batch_descriptor=batch_descriptor):
+with set_forward_context(
+    ..., 
+    cudagraph_runtime_mode=runtime_mode, 
+    batch_descriptor=batch_descriptor,
+):
      output = self.model(...)
 ```
 
@@ -202,10 +204,10 @@ from vllm.config import CUDAGraphMode
 
 compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
 model = vllm.LLM(
-            model="meta-llama/Llama-3.1-8B-Instruct",
-            dtype='auto',
-            compilation_config = compilation_config,
-        )
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    dtype="auto",
+    compilation_config=compilation_config,
+)
 sampling_params = vllm.SamplingParams(
     temperature=0,  # greedy decoding
     max_tokens=1024,
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index e70ee4a076e5..682fc5c413e2 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
 IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
 
 ```python
-IOProcessorInput = TypeVar('IOProcessorInput')
-IOProcessorOutput = TypeVar('IOProcessorOutput')
+IOProcessorInput = TypeVar("IOProcessorInput")
+IOProcessorOutput = TypeVar("IOProcessorOutput")
 
 class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
 
@@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     def pre_process(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         raise NotImplementedError
 
     async def pre_process_async(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         return self.pre_process(prompt, request_id, **kwargs)
 
     @abstractmethod
-    def post_process(self,
-                     model_output: Sequence[PoolingRequestOutput],
-                     request_id: Optional[str] = None,
-                     **kwargs) -> IOProcessorOutput:
+    def post_process(
+        self,
+        model_output: Sequence[PoolingRequestOutput],
+        request_id: str | None = None,
+        **kwargs,
+    ) -> IOProcessorOutput:
         raise NotImplementedError
 
     async def post_process_async(
         self,
         model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
     ) -> IOProcessorOutput:
         collected_output = [item async for i, item in model_output]
@@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
 
     @abstractmethod
     def output_to_response(
-            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
+        self, plugin_output: IOProcessorOutput
+    ) -> IOProcessorResponse:
         raise NotImplementedError
 ```
 
diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 90b2fd32f297..c4a2d72a2f4a 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -478,15 +478,17 @@ us with:
 
 ```python
 if seq_group.is_finished():
-    if (seq_group.metrics.first_scheduled_time is not None and
-            seq_group.metrics.first_token_time is not None):
+    if (
+        seq_group.metrics.first_scheduled_time is not None
+        and seq_group.metrics.first_token_time is not None
+    ):
         time_queue_requests.append(
             seq_group.metrics.first_scheduled_time -
-            seq_group.metrics.arrival_time)
+            seq_group.metrics.arrival_time
+        )
     ...
     if seq_group.metrics.time_in_queue is not None:
-        time_in_queue_requests.append(
-            seq_group.metrics.time_in_queue)
+        time_in_queue_requests.append(seq_group.metrics.time_in_queue)
 ```
 
 This seems duplicative, and one of them should be removed. The latter
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index 9941837bf165..270699df623e 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -112,8 +112,8 @@ class KVCacheBlock:
     ref_cnt: int
 
     # The pointers to form a doubly linked list for the free queue.
-    prev_free_block: Optional["KVCacheBlock"] = None
-    next_free_block: Optional["KVCacheBlock"] = None
+    prev_free_block: "KVCacheBlock | None" = None
+    next_free_block: "KVCacheBlock | None" = None
 ```
 
 There are two design points to highlight:
diff --git a/docs/features/lora.md b/docs/features/lora.md
index db794b2ebd71..d3b44520a5a7 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
     sampling_params = SamplingParams(
         temperature=0,
         max_tokens=256,
-        stop=["[/assistant]"]
+        stop=["[/assistant]"],
     )
 
     prompts = [
@@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
     outputs = llm.generate(
         prompts,
         sampling_params,
-        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
     )
     ```
 
@@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
                 lora_request = LoRARequest(
                     lora_name=lora_name,
                     lora_path=local_path,
-                    lora_int_id=abs(hash(lora_name))
+                    lora_int_id=abs(hash(lora_name)),
                 )
                 return lora_request
         ```
@@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
         if has_audio:
             question = f"<|audio|>{question}"
         chat = [
-            {
-                "role": "user",
-                "content": question
-            }
+            {"role": "user", "content": question},
         ]
         return tokenizer.apply_chat_template(chat, tokenize=False)
 
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index dcc5ea3b9096..8f75f714d4b0 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": {
-            "image": [image1, image2]
-        },
+        "multi_modal_data": {"image": [image1, image2]},
     })
 
     for o in outputs:
@@ -183,21 +181,24 @@ conversation = [
     {"role": "assistant", "content": "Hello! How can I assist you today?"},
     {
         "role": "user",
-        "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
-        },{
-            "type": "image_pil",
-            "image_pil": image_pil
-        }, {
-            "type": "image_embeds",
-            "image_embeds": image_embeds
-        }, {
-            "type": "text",
-            "text": "What's in these images?"
-        }],
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            },
+            {
+                "type": "image_pil",
+                "image_pil": image_pil,
+            },
+            {
+                "type": "image_embeds",
+                "image_embeds": image_embeds,
+            },
+            {
+                "type": "text",
+                "text": "What's in these images?",
+            },
+        ],
     },
 ]
 
@@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
     message = {
         "role": "user",
         "content": [
-            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
+            {
+                "type": "text",
+                "text": "Describe this set of frames. Consider the frames to be a part of the same video.",
+            },
         ],
     }
     for i in range(len(video_frames)):
@@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
     # Custom black background for dark theme
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
     )
 
     # Custom brand color background (e.g., blue)
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
+        media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
     )
     ```
 
@@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
         limit_mm_per_prompt={"video": 1},
     )
 
-    sampling_params = SamplingParams(
-        max_tokens=1024,
-    )
+    sampling_params = SamplingParams(max_tokens=1024)
 
     video_messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant.",
+        },
+        {
+            "role": "user",
+            "content": [
                 {"type": "text", "text": "describe this video."},
                 {
                     "type": "video",
                     "video": video_path,
                     "total_pixels": 20480 * 28 * 28,
-                    "min_pixels": 16 * 28 * 28
-                }
+                    "min_pixels": 16 * 28 * 28,
+                },
             ]
         },
     ]
@@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows:
 
     chat_response = client.chat.completions.create(
         model="microsoft/Phi-3.5-vision-instruct",
-        messages=[{
-            "role": "user",
-            "content": [
-                # NOTE: The prompt formatting with the image token `<image>` is not needed
-                # since the prompt will be processed automatically by the API server.
-                {"type": "text", "text": "What’s in this image?"},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        url": image_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    # NOTE: The prompt formatting with the image token `<image>` is not needed
+                    # since the prompt will be processed automatically by the API server.
+                    {
+                        "type": "text",
+                        "text": "What’s in this image?",
                     },
-                    "uuid": image_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                        "uuid": image_url,  # Optional
+                    },
+                ],
+            }
+        ],
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
 
@@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows:
 
     chat_response = client.chat.completions.create(
         model="microsoft/Phi-3.5-vision-instruct",
-        messages=[{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What are the animals in these images?"},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url_duck
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What are the animals in these images?",
                     },
-                    "uuid": image_url_duck # Optional
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url_lion
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url_duck},
+                        "uuid": image_url_duck,  # Optional
                     },
-                    "uuid": image_url_lion # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url_lion},
+                        "uuid": image_url_lion,  # Optional
+                    },
+                ],
+            }
+        ],
     )
     print("Chat completion output:", chat_response.choices[0].message.content)
     ```
@@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows:
 
     ## Use video url in the payload
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this video?"
-                },
-                {
-                    "type": "video_url",
-                    "video_url": {
-                        "url": video_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this video?",
                     },
-                    "uuid": video_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": video_url},
+                        "uuid": video_url,  # Optional
+                    },
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows:
     audio_base64 = encode_base64_content_from_url(audio_url)
 
     chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "input_audio",
-                    "input_audio": {
-                        "data": audio_base64,
-                        "format": "wav"
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this audio?",
                     },
-                    "uuid": audio_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                        "uuid": audio_url,  # Optional
+                    },
+                ],
+            },
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
 
     ```python
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this audio?",
                     },
-                    "uuid": audio_url # Optional
-                },
-            ],
-        }],
+                    {
+                        "type": "audio_url",
+                        "audio_url": {"url": audio_url},
+                        "uuid": audio_url,  # Optional
+                    },
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
 
     # Basic usage - this is equivalent to the LLaVA example for offline inference
     model = "llava-hf/llava-1.5-7b-hf"
-    embeds =  {
+    embeds = {
         "type": "image_embeds",
         "image_embeds": f"{base64_image_embedding}",
-        "uuid": image_url # Optional
+        "uuid": image_url,  # Optional
     }
 
     # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
     model = "Qwen/Qwen2-VL-2B-Instruct"
-    embeds =  {
+    embeds = {
         "type": "image_embeds",
         "image_embeds": {
-            "image_embeds": f"{base64_image_embedding}" , # Required
-            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
+            "image_embeds": f"{base64_image_embedding}",  # Required
+            "image_grid_thw": f"{base64_image_grid_thw}",  # Required by Qwen/Qwen2-VL-2B-Instruct
         },
-        "uuid": image_url # Optional
+        "uuid": image_url,  # Optional
     }
     model = "openbmb/MiniCPM-V-2_6"
-    embeds =  {
+    embeds = {
         "type": "image_embeds",
         "image_embeds": {
-            "image_embeds": f"{base64_image_embedding}" , # Required
-            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
+            "image_embeds": f"{base64_image_embedding}",  # Required
+            "image_sizes": f"{base64_image_sizes}",  # Required by openbmb/MiniCPM-V-2_6
         },
-        "uuid": image_url # Optional
+        "uuid": image_url,  # Optional
     }
     chat_completion = client.chat.completions.create(
         messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": [
             {
-                "type": "text",
-                "text": "What's in this image?",
+                "role": "system",
+                "content": "You are a helpful assistant.",
             },
-            embeds,
-            ],
-        },
-    ],
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's in this image?",
+                    },
+                    embeds,
+                ],
+            },
+        ],
         model=model,
     )
     ```
@@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
         {
             "type": "image_embeds",
             "image_embeds": None,
-            "uuid": image_uuid
+            "uuid": image_uuid,
         },
 
         # input_audio:
         {
             "type": "input_audio",
             "input_audio": None,
-            "uuid": audio_uuid
+            "uuid": audio_uuid,
         },
 
         # PIL Image:
         {
             "type": "image_pil",
-            "image_pil": None
-            "uuid": image_uuid
-        }
+            "image_pil": None,
+            "uuid": image_uuid,
+        },
 
     ```
 
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index 389b3cb21ef5..ab04a1efcc08 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
     # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
     # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
     # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
-    stream = client.chat.completions.create(model=model,
-                                            messages=messages,
-                                            stream=True)
+    stream = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        stream=True,
+    )
 
     print("client: Start streaming chat completions...")
     printed_reasoning_content = False
@@ -159,27 +161,29 @@ The reasoning content is also available when both tool calling and the reasoning
 
     client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
 
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
-                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
-                },
-                "required": ["location", "unit"]
-            }
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location", "unit"],
+                }
+            },
         }
-    }]
+    ]
 
     response = client.chat.completions.create(
         model=client.models.list().data[0].id,
         messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
         tools=tools,
-        tool_choice="auto"
+        tool_choice="auto",
     )
 
     print(response)
@@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
             previous_token_ids: Sequence[int],
             current_token_ids: Sequence[int],
             delta_token_ids: Sequence[int],
-        ) -> Union[DeltaMessage, None]:
+        ) -> DeltaMessage | None:
             """
             Instance method that should be implemented for extracting reasoning
             from an incomplete response; for use when handling reasoning calls and
@@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
             """
 
         def extract_reasoning_content(
-                self, model_output: str, request: ChatCompletionRequest
-        ) -> tuple[Optional[str], Optional[str]]:
+            self,
+            model_output: str,
+            request: ChatCompletionRequest | ResponsesRequest,
+        ) -> tuple[str | None, str | None]:
             """
             Extract reasoning content from a complete model-generated string.
 
@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
 
         @classmethod
         def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
-            return cls(start_token_id=tokenizer.encode(
-                "<think>", add_special_tokens=False)[0],
-                    end_token_id=tokenizer.encode("</think>",
-                                                    add_special_tokens=False)[0])
+            return cls(
+                start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
+                end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
+            )
 
         def is_reasoning_end(self, input_ids: list[int]) -> bool:
             return self.end_token_id in input_ids
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index e57a8945971f..02a700c09d39 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools:
         return f"Getting the weather for {location} in {unit}..."
     tool_functions = {"get_weather": get_weather}
 
-    tools = [{
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
-                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+                    },
+                    "required": ["location", "unit"],
                 },
-                "required": ["location", "unit"]
-            }
-        }
-    }]
+            },
+        },
+    ]
 
     response = client.chat.completions.create(
         model=client.models.list().data[0].id,
         messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
         tools=tools,
-        tool_choice="auto"
+        tool_choice="auto",
     )
 
     tool_call = response.choices[0].message.tool_calls[0].function
@@ -402,8 +404,7 @@ Here is a summary of a plugin file:
 
         # adjust request. e.g.: set skip special tokens
         # to False for tool call output.
-        def adjust_request(
-                self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
             return request
 
         # implement the tool call parse for stream call
@@ -416,7 +417,7 @@ Here is a summary of a plugin file:
             current_token_ids: Sequence[int],
             delta_token_ids: Sequence[int],
             request: ChatCompletionRequest,
-        ) -> Union[DeltaMessage, None]:
+        ) -> DeltaMessage | None:
             return delta
 
         # implement the tool parse for non-stream call