
Commit 8fcaaf6

Update Optional[x] -> x | None and Union[x, y] to x | y (#26633)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1 parent 9bb3813 commit 8fcaaf6
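
For readers skimming the 944-file diff below, the rewrite is purely mechanical, following PEP 604: Optional[X] becomes X | None, Union[X, Y] becomes X | Y, and the now-unused typing imports are dropped. A minimal illustrative sketch of the pattern (not code taken from this commit):

    # Before: typing-module spellings
    from typing import Optional, Union

    def load(path: Optional[str] = None) -> Union[bytes, str]: ...

    # After: PEP 604 builtin union operator; no typing import needed
    def load(path: str | None = None) -> bytes | str: ...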

944 files changed: +9491 / -10122 lines


benchmarks/backend_request_func.py

Lines changed: 13 additions & 14 deletions
@@ -8,7 +8,6 @@
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
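
One caveat worth noting (an assumption about the runtime, since the commit doesn't state the project's minimum supported Python): X | None evaluated in annotations at import time requires Python 3.10+. On 3.9 the same syntax only parses if annotation evaluation is deferred, e.g.:

    # Hedged compatibility sketch: the future import defers annotation
    # evaluation, so the PEP 604 syntax below also parses on Python 3.9.
    # On 3.10+ it is unnecessary.
    from __future__ import annotations

    def maybe_len(s: str | None) -> int | None:
        return None if s is None else len(s)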

benchmarks/benchmark_prefix_caching.py

Lines changed: 2 additions & 3 deletions
@@ -32,7 +32,6 @@
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []

benchmarks/benchmark_prioritization.py

Lines changed: 1 addition & 2 deletions
@@ -7,7 +7,6 @@
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

benchmarks/benchmark_serving_structured_output.py

Lines changed: 3 additions & 4 deletions
@@ -32,7 +32,6 @@
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np
@@ -316,7 +315,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -436,9 +435,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]

benchmarks/benchmark_utils.py

Lines changed: 8 additions & 8 deletions
@@ -6,7 +6,7 @@
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
 
@@ -104,22 +104,22 @@ def collect(self, v: int) -> None:
         else:
             self._max = max(self._max, v)
 
-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"
 
-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]
 
     def __enter__(self) -> None:
         self.start_time = time.monotonic_ns()
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)
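
The __exit__ signature above is the standard context-manager protocol, just re-spelled with | None. A self-contained sketch of the same idiom; Stopwatch is a simplified, hypothetical stand-in for TimeCollector, not a vLLM class:

    import time
    from types import TracebackType

    class Stopwatch:
        """Times a with-block in nanoseconds."""

        def __init__(self) -> None:
            self.elapsed_ns: int | None = None
            self._start: int = 0

        def __enter__(self) -> "Stopwatch":
            self._start = time.monotonic_ns()
            return self

        def __exit__(
            self,
            exc_type: type[BaseException] | None,
            exc_value: BaseException | None,
            exc_traceback: TracebackType | None,
        ) -> None:
            self.elapsed_ns = time.monotonic_ns() - self._start

    with Stopwatch() as sw:
        sum(range(10_000))
    print(sw.elapsed_ns)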

benchmarks/cutlass_benchmarks/sparse_benchmarks.py

Lines changed: 1 addition & 2 deletions
@@ -6,8 +6,7 @@
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
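
The Callable half of this change is a related cleanup: since Python 3.9 (PEP 585), typing.Callable is a deprecated alias for collections.abc.Callable, and the two are parameterized identically. A small sketch (apply_twice is a made-up example, not from the benchmark):

    from collections.abc import Callable

    # Subscription syntax is unchanged: [argument types], return type.
    def apply_twice(fn: Callable[[int], int], x: int) -> int:
        return fn(fn(x))

    print(apply_twice(lambda v: v + 1, 0))  # prints 2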

benchmarks/cutlass_benchmarks/w8a8_benchmarks.py

Lines changed: 5 additions & 6 deletions
@@ -6,8 +6,7 @@
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -53,7 +52,7 @@ def bench_int8(
     n: int,
     label: str,
    sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -183,7 +182,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
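
bench_kernels: list[str] | None = None is the usual idiom for an optional list argument: default to None (never to a mutable []) and treat None as "no filter". A hedged sketch of how such a filter is typically applied; the actual bench_int8/bench_fp8 bodies are not shown in this diff, and select_kernels is a hypothetical helper:

    def select_kernels(
        available: list[str],
        bench_kernels: list[str] | None = None,
    ) -> list[str]:
        # None means "benchmark every kernel"; a list restricts the run.
        if bench_kernels is None:
            return available
        return [k for k in available if k in bench_kernels]

    print(select_kernels(["cutlass_i8", "torch_mm"], ["torch_mm"]))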

benchmarks/fused_kernels/layernorm_rms_benchmarks.py

Lines changed: 4 additions & 5 deletions
@@ -3,10 +3,9 @@
 
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(

benchmarks/kernels/bench_per_token_quant_fp8.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch
 
 import pandas as pd

benchmarks/kernels/benchmark_device_communicators.py

Lines changed: 3 additions & 3 deletions
@@ -22,8 +22,8 @@
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional
 
 import torch
 import torch.distributed as dist
@@ -264,12 +264,12 @@ def benchmark_allreduce(
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)
