@@ -48,191 +48,6 @@ class RequestFuncOutput:
     error: str = ""
 
 
-async def async_request_tgi(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        params = {
-            "best_of": request_func_input.best_of,
-            "max_new_tokens": request_func_input.output_len,
-            "do_sample": True,
-            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
-            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
-            "truncate": request_func_input.prompt_len,
-            # TGI does not accept ignore_eos flag.
-        }
-        payload = {
-            "inputs": request_func_input.prompt,
-            "parameters": params,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk_bytes = chunk_bytes.decode("utf-8")
-
-                        # NOTE: Sometimes TGI returns a ping response without
-                        # any data, we should skip it.
-                        if chunk_bytes.startswith(":"):
-                            continue
-                        chunk = chunk_bytes.removeprefix("data:")
-
-                        data = json.loads(chunk)
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = time.perf_counter() - st
-                            output.ttft = ttft
-
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp -
-                                              most_recent_timestamp)
-
-                        most_recent_timestamp = timestamp
-
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-                    output.generated_text = data["generated_text"]
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-async def async_request_trt_llm(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        assert request_func_input.best_of == 1
-        payload = {
-            "accumulate_tokens": True,
-            "text_input": request_func_input.prompt,
-            "temperature": 0.0,
-            "top_p": 1.0,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-        }
-        if request_func_input.ignore_eos:
-            payload["min_length"] = request_func_input.output_len
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                            "data:")
-
-                        data = json.loads(chunk)
-                        output.generated_text += data["text_output"]
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = timestamp - st
-                            output.ttft = ttft
-
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp -
-                                              most_recent_timestamp)
-
-                        most_recent_timestamp = timestamp
-
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-async def async_request_deepspeed_mii(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        assert request_func_input.best_of == 1
-
-        payload = {
-            "prompt": request_func_input.prompt,
-            "max_tokens": request_func_input.output_len,
-            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
-            "top_p": 1.0,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
-        # will use 0 as placeholder.
-        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
-        output.ttft = 0
-
-        st = time.perf_counter()
-        try:
-            async with session.post(url=request_func_input.api_url,
-                                    json=payload) as response:
-                if response.status == 200:
-                    parsed_resp = await response.json()
-                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp["text"][0]
-                    output.success = True
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -332,158 +147,10 @@ async def async_request_openai_completions(
     return output
 
 
-async def async_request_openai_chat_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        if request_func_input.multi_modal_content:
-            content.append(request_func_input.multi_modal_content)
-        payload = {
-            "model": request_func_input.model_name \
-                if request_func_input.model_name else request_func_input.model,
-            "messages": [
-                {
-                    "role": "user",
-                    "content": content
-                },
-            ],
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
-        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        generated_text = ""
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload,
-                                    headers=headers) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                            "data: ")
-                        if chunk != "[DONE]":
-                            timestamp = time.perf_counter()
-                            data = json.loads(chunk)
-
-                            if choices := data.get("choices"):
-                                content = choices[0]["delta"].get("content")
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = timestamp - st
-                                    output.ttft = ttft
-
-                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
-
-                                generated_text += content or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
-
-                            most_recent_timestamp = timestamp
-
-                    output.generated_text = generated_text
-                    output.success = True
-                    output.latency = most_recent_timestamp - st
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
-        from modelscope import snapshot_download
-
-        # Use file lock to prevent multiple processes from
-        # downloading the same model weights at the same time.
-        with get_lock(pretrained_model_name_or_path):
-            model_path = snapshot_download(
-                model_id=pretrained_model_name_or_path,
-                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-
-            return model_path
-    return pretrained_model_name_or_path
-
-
-def get_tokenizer(
-    pretrained_model_name_or_path: str,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if pretrained_model_name_or_path is not None and not os.path.exists(
-            pretrained_model_name_or_path):
-        pretrained_model_name_or_path = get_model(
-            pretrained_model_name_or_path)
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError(
-                "Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-    if tokenizer_mode == "mistral":
-        try:
-            from vllm.transformers_utils.tokenizer import MistralTokenizer
-        except ImportError as e:
-            raise ImportError("MistralTokenizer requires vllm package.\n"
-                              "Please install it with `pip install vllm` "
-                              "to use mistral tokenizer mode.") from e
-        return MistralTokenizer.from_pretrained(
-            str(pretrained_model_name_or_path))
-    else:
-        return AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
-
-
 ASYNC_REQUEST_FUNCS = {
-    "tgi": async_request_tgi,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
-    "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
-    "openai-chat": async_request_openai_chat_completions,
-    "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
 }