@@ -9,7 +9,7 @@
 from contextlib import contextmanager, nullcontext
 from functools import cache
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Optional
 
 import torch
 import uvloop
@@ -75,12 +75,12 @@ def lora_path_on_disk(lora_path: str) -> str:
     return get_adapter_absolute_path(lora_path)
 
 
-lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
+lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
 
 
 def get_random_lora_request(
         args: argparse.Namespace
-) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
+) -> tuple[LoRARequest, Optional[AnyTokenizer]]:
     global lora_tokenizer_cache
     lora_id = random.randint(1, args.max_loras)
     lora_request = LoRARequest(lora_name=str(lora_id),
@@ -92,7 +92,7 @@ def get_random_lora_request(
 
 
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
-                    args: argparse.Namespace) -> List[SampleRequest]:
+                    args: argparse.Namespace) -> list[SampleRequest]:
 
     dataset_path: str = args.dataset
     num_requests: int = args.num_prompts
@@ -110,7 +110,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    filtered_dataset: List[SampleRequest] = []
+    filtered_dataset: list[SampleRequest] = []
     for data in tqdm(dataset,
                      total=len(filtered_dataset),
                      desc="sampling requests"):
@@ -166,7 +166,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
 
 def run_vllm(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
 ) -> float:
@@ -222,8 +222,8 @@ def get_profiling_context(profile_dir: Optional[str] = None):
     llm = LLM(**dataclasses.asdict(engine_args))
 
     # Add the requests to the engine.
-    prompts: List[TextPrompt] = []
-    sampling_params: List[SamplingParams] = []
+    prompts: list[TextPrompt] = []
+    sampling_params: list[SamplingParams] = []
     for request in requests:
         prompts.append(
             TextPrompt(prompt=request.prompt,
@@ -236,7 +236,7 @@ def get_profiling_context(profile_dir: Optional[str] = None):
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
             ))
-    lora_requests: Optional[List[LoRARequest]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
     if engine_args.enable_lora:
         lora_requests = [request.lora_request for request in requests]
 
@@ -274,7 +274,7 @@ def get_profiling_context(profile_dir: Optional[str] = None):
 
 
 async def run_vllm_async(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
     n: int,
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
@@ -285,9 +285,9 @@ async def run_vllm_async(
             engine_args, disable_frontend_multiprocessing) as llm:
 
         # Add the requests to the engine.
-        prompts: List[TextPrompt] = []
-        sampling_params: List[SamplingParams] = []
-        lora_requests: List[Optional[LoRARequest]] = []
+        prompts: list[TextPrompt] = []
+        sampling_params: list[SamplingParams] = []
+        lora_requests: list[Optional[LoRARequest]] = []
         for request in requests:
             prompts.append(
                 TextPrompt(prompt=request.prompt,
@@ -319,7 +319,7 @@ async def run_vllm_async(
 
 
 def run_hf(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
@@ -335,7 +335,7 @@ def run_hf(
 
     pbar = tqdm(total=len(requests))
     start = time.perf_counter()
-    batch: List[str] = []
+    batch: list[str] = []
     max_prompt_len = 0
     max_output_len = 0
     for i in range(len(requests)):
@@ -377,7 +377,7 @@ def run_hf(
 
 
 def run_mii(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
     model: str,
     tensor_parallel_size: int,
     output_len: int,
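
The change is mechanical throughout: PEP 585 (Python 3.9+) made the builtin container types subscriptable, so `typing.Dict`, `typing.List`, and `typing.Tuple` can be replaced by `dict`, `list`, and `tuple` in annotations, while `Optional` still has to come from `typing` (or be spelled `X | None` on 3.10+). A minimal sketch of the before/after pattern; the names here (`cache`, `min_max`) are illustrative, not taken from the benchmark script:

```python
from typing import Optional

# Pre-PEP 585 style, as removed in this diff:
#   from typing import Dict, List, Tuple
#   cache: Dict[int, str] = {}
#   def min_max(xs: List[int]) -> Tuple[int, int]: ...

# Builtin generics, as adopted in this diff (Python 3.9+):
cache: dict[int, str] = {}

def min_max(xs: list[int]) -> tuple[int, int]:
    # Optional[...] is still imported from typing;
    # on 3.10+ it could be written as "str | None".
    return min(xs), max(xs)

maybe: Optional[str] = None
```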