
Commit d5bc88b

Compacted utility functions (vllm-project#13840)
Signed-off-by: Randy Chen <acad.randyjhc@gmail.com>
1 parent b9bbd7d commit d5bc88b

File tree

2 files changed (+0, -348 lines)


vllm/benchmarks/backend_request_func.py

Lines changed: 0 additions & 333 deletions
@@ -48,191 +48,6 @@ class RequestFuncOutput:
     error: str = ""


-async def async_request_tgi(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        params = {
-            "best_of": request_func_input.best_of,
-            "max_new_tokens": request_func_input.output_len,
-            "do_sample": True,
-            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
-            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
-            "truncate": request_func_input.prompt_len,
-            # TGI does not accept ignore_eos flag.
-        }
-        payload = {
-            "inputs": request_func_input.prompt,
-            "parameters": params,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-                        chunk_bytes = chunk_bytes.decode("utf-8")
-
-                        # NOTE: Sometimes TGI returns a ping response without
-                        # any data, we should skip it.
-                        if chunk_bytes.startswith(":"):
-                            continue
-                        chunk = chunk_bytes.removeprefix("data:")
-
-                        data = json.loads(chunk)
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = time.perf_counter() - st
-                            output.ttft = ttft
-
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp -
-                                              most_recent_timestamp)
-
-                        most_recent_timestamp = timestamp
-
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-                    output.generated_text = data["generated_text"]
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
-async def async_request_trt_llm(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        assert request_func_input.best_of == 1
-        payload = {
-            "accumulate_tokens": True,
-            "text_input": request_func_input.prompt,
-            "temperature": 0.0,
-            "top_p": 1.0,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-        }
-        if request_func_input.ignore_eos:
-            payload["min_length"] = request_func_input.output_len
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                            "data:")
-
-                        data = json.loads(chunk)
-                        output.generated_text += data["text_output"]
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = timestamp - st
-                            output.ttft = ttft
-
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp -
-                                              most_recent_timestamp)
-
-                        most_recent_timestamp = timestamp
-
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
-async def async_request_deepspeed_mii(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        assert request_func_input.best_of == 1
-
-        payload = {
-            "prompt": request_func_input.prompt,
-            "max_tokens": request_func_input.output_len,
-            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
-            "top_p": 1.0,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
-        # will use 0 as placeholder.
-        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
-        output.ttft = 0
-
-        st = time.perf_counter()
-        try:
-            async with session.post(url=request_func_input.api_url,
-                                    json=payload) as response:
-                if response.status == 200:
-                    parsed_resp = await response.json()
-                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp["text"][0]
-                    output.success = True
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -332,158 +147,10 @@ async def async_request_openai_completions(
         return output


-async def async_request_openai_chat_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        if request_func_input.multi_modal_content:
-            content.append(request_func_input.multi_modal_content)
-        payload = {
-            "model": request_func_input.model_name \
-                if request_func_input.model_name else request_func_input.model,
-            "messages": [
-                {
-                    "role": "user",
-                    "content": content
-                },
-            ],
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "stream_options": {
-                "include_usage": True,
-            },
-        }
-        if request_func_input.ignore_eos:
-            payload["ignore_eos"] = request_func_input.ignore_eos
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        generated_text = ""
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload,
-                                    headers=headers) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                            "data: ")
-                        if chunk != "[DONE]":
-                            timestamp = time.perf_counter()
-                            data = json.loads(chunk)
-
-                            if choices := data.get("choices"):
-                                content = choices[0]["delta"].get("content")
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = timestamp - st
-                                    output.ttft = ttft
-
-                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
-
-                                generated_text += content or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
-
-                            most_recent_timestamp = timestamp
-
-                    output.generated_text = generated_text
-                    output.success = True
-                    output.latency = most_recent_timestamp - st
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))

-        if pbar:
-            pbar.update(1)
-        return output
-
-
-def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
-        from modelscope import snapshot_download
-
-        # Use file lock to prevent multiple processes from
-        # downloading the same model weights at the same time.
-        with get_lock(pretrained_model_name_or_path):
-            model_path = snapshot_download(
-                model_id=pretrained_model_name_or_path,
-                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-
-            return model_path
-    return pretrained_model_name_or_path
-
-
-def get_tokenizer(
-    pretrained_model_name_or_path: str,
-    tokenizer_mode: str = "auto",
-    trust_remote_code: bool = False,
-    **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if pretrained_model_name_or_path is not None and not os.path.exists(
-            pretrained_model_name_or_path):
-        pretrained_model_name_or_path = get_model(
-            pretrained_model_name_or_path)
-    if tokenizer_mode == "slow":
-        if kwargs.get("use_fast", False):
-            raise ValueError(
-                "Cannot use the fast tokenizer in slow tokenizer mode.")
-        kwargs["use_fast"] = False
-    if tokenizer_mode == "mistral":
-        try:
-            from vllm.transformers_utils.tokenizer import MistralTokenizer
-        except ImportError as e:
-            raise ImportError("MistralTokenizer requires vllm package.\n"
-                              "Please install it with `pip install vllm` "
-                              "to use mistral tokenizer mode.") from e
-        return MistralTokenizer.from_pretrained(
-            str(pretrained_model_name_or_path))
-    else:
-        return AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
-
-
 ASYNC_REQUEST_FUNCS = {
-    "tgi": async_request_tgi,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
-    "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
-    "openai-chat": async_request_openai_chat_completions,
-    "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
 }
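
After this change, every backend that stays registered in ASYNC_REQUEST_FUNCS resolves to async_request_openai_completions. For orientation, the sketch below shows how such a registry entry is typically driven from a benchmark script. It is a minimal, hypothetical example: the import path and the exact RequestFuncInput constructor fields are assumptions inferred from the attributes referenced in the diff above (prompt, api_url, prompt_len, output_len, model), not something this commit defines.

# Minimal sketch (not part of this commit): drive one registered backend.
# The import path and RequestFuncInput fields are assumptions inferred from
# the code shown in this diff; verify against the current file before use.
import asyncio

from vllm.benchmarks.backend_request_func import (ASYNC_REQUEST_FUNCS,
                                                  RequestFuncInput)


async def send_one_request() -> None:
    # Every surviving key maps to the OpenAI-completions-style request func.
    request_func = ASYNC_REQUEST_FUNCS["vllm"]

    request_input = RequestFuncInput(
        prompt="Hello, world!",
        api_url="http://localhost:8000/v1/completions",  # assumed local server
        prompt_len=4,             # token count of the prompt
        output_len=16,            # requested completion length
        model="my-served-model",  # placeholder model name
    )

    output = await request_func(request_func_input=request_input)
    print(output.success, output.ttft, output.latency, output.generated_text)


if __name__ == "__main__":
    asyncio.run(send_one_request())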

vllm/benchmarks/benchmark_utils.py

Lines changed: 0 additions & 15 deletions
@@ -49,21 +49,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
     return records


-class InfEncoder(json.JSONEncoder):
-
-    def clear_inf(self, o: Any):
-        if isinstance(o, dict):
-            return {k: self.clear_inf(v) for k, v in o.items()}
-        elif isinstance(o, list):
-            return [self.clear_inf(v) for v in o]
-        elif isinstance(o, float) and math.isinf(o):
-            return "inf"
-        return o
-
-    def iterencode(self, o: Any, *args, **kwargs) -> Any:
-        return super().iterencode(self.clear_inf(o), *args, **kwargs)
-
-
 def write_to_json(filename: str, records: list) -> None:
     with open(filename, "w") as f:
         json.dump(records, f, cls=InfEncoder)
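
For context on what is dropped here: InfEncoder exists so that json.dump never emits the bare Infinity token (which is not valid JSON) when benchmark records contain infinite values; note that write_to_json in the surviving context still passes cls=InfEncoder, so the class is presumably provided from elsewhere after this compaction. Below is a self-contained sketch of the same technique, reproducing the deleted encoder purely for illustration.

# Standalone illustration of the deleted InfEncoder technique: rewrite
# infinite floats as the string "inf" before encoding so the output stays
# valid JSON (the default encoder would emit the non-standard token Infinity).
import json
import math
from typing import Any


class InfEncoder(json.JSONEncoder):

    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


# Example: a record with an infinite latency percentile serializes cleanly.
records = [{"p99_ttft_ms": float("inf"), "request_throughput": 42.0}]
print(json.dumps(records, cls=InfEncoder))
# -> [{"p99_ttft_ms": "inf", "request_throughput": 42.0}]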
