Commit 14cc317

OpenAI Server refactoring (#2360)

1 parent e1957c6 commit 14cc317

8 files changed: +954 -643 lines changed

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions

@@ -19,6 +19,9 @@ steps:
 - label: Engine Test
   command: pytest -v -s engine
 
+- label: Entrypoints Test
+  command: pytest -v -s entrypoints
+
 - label: Kernels Test
   command: pytest -v -s kernels
   soft_fail: true
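The new CI step mirrors the neighbouring suites, whose relative paths (`engine`, `kernels`) suggest they are invoked from the repository's tests/ directory. A minimal local equivalent, sketched under that assumption (the working-directory detail is inferred, not stated in this commit):

import sys

import pytest

if __name__ == "__main__":
    # Equivalent of the CI command `pytest -v -s entrypoints`,
    # assuming the current working directory is the tests/ folder.
    sys.exit(pytest.main(["-v", "-s", "entrypoints"]))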

requirements-dev.txt

Lines changed: 3 additions & 0 deletions

@@ -16,3 +16,6 @@ pytest-asyncio
 httpx
 einops  # required for MPT
 flash_attn  # required for HuggingFace's llama implementation
+openai
+requests
+ray

tests/async_engine/test_openai_server.py renamed to tests/async_engine/test_chat_template.py

Lines changed: 17 additions & 19 deletions

@@ -1,12 +1,12 @@
-from argparse import Namespace
 from dataclasses import dataclass
 import os
 import pathlib
 
 import pytest
-from fastapi.testclient import TestClient
 
-from vllm.entrypoints.openai.api_server import *
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 
 chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
     __file__))).parent.parent / "examples/template_chatml.jinja"
@@ -48,21 +48,24 @@
         'content': 'What is the capital of'
     },
 ]
-client = TestClient(app)
 
 
 @dataclass
 class MockTokenizer:
     chat_template = None
 
 
+@dataclass
+class MockServingChat:
+    tokenizer: MockTokenizer
+
+
 def test_load_chat_template():
     # Testing chatml template
-    mock_args = Namespace(chat_template=chatml_jinja_path)
     tokenizer = MockTokenizer()
-
-    # Call the function with the mocked args
-    load_chat_template(mock_args, tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=chatml_jinja_path)
 
     template_content = tokenizer.chat_template
 
@@ -76,11 +79,11 @@ def test_load_chat_template():
 def test_no_load_chat_template():
     # Testing chatml template
     template = "../../examples/does_not_exist"
-    mock_args = Namespace(chat_template=template)
     tokenizer = MockTokenizer()
 
-    # Call the function with the mocked args
-    load_chat_template(mock_args, tokenizer=tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=template)
     template_content = tokenizer.chat_template
 
     # Test assertions
@@ -97,9 +100,9 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
                               expected_output):
     # Initialize the tokenizer
     tokenizer = get_tokenizer(tokenizer_name=model)
-
-    mock_args = Namespace(chat_template=template)
-    load_chat_template(mock_args, tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=template)
 
     # Create a mock request object using keyword arguments
     mock_request = ChatCompletionRequest(
@@ -115,8 +118,3 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
 
     # Test assertion
     assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
-
-
-def test_health_endpoint():
-    response = client.get("/health")
-    assert response.status_code == 200
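After this refactor, chat-template loading is a method on OpenAIServingChat rather than a free function fed an argparse Namespace, which is why the renamed tests call it unbound on a duck-typed stand-in that only exposes a tokenizer attribute. A condensed sketch of that pattern (the Fake* class names and the relative template path are illustrative, not part of the commit):

from dataclasses import dataclass

from vllm.entrypoints.openai.serving_chat import OpenAIServingChat


class FakeTokenizer:
    # Only the attribute that _load_chat_template writes to.
    chat_template = None


@dataclass
class FakeServingChat:
    # Duck-typed replacement for a full OpenAIServingChat instance.
    tokenizer: FakeTokenizer


tokenizer = FakeTokenizer()
OpenAIServingChat._load_chat_template(FakeServingChat(tokenizer),
                                      chat_template="examples/template_chatml.jinja")
print(tokenizer.chat_template)  # the loaded template text ends up on the tokenizer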
Lines changed: 193 additions & 0 deletions

@@ -0,0 +1,193 @@
+import time
+import subprocess
+
+import sys
+import pytest
+import requests
+import ray  # using Ray for overall ease of process management, parallel requests, and debugging.
+import openai  # use the official client for correctness check
+
+MAX_SERVER_START_WAIT_S = 600  # wait up to this many seconds for the server to start
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # any model with a chat template should work here
+
+pytestmark = pytest.mark.asyncio
+
+
+@ray.remote(num_gpus=1)
+class ServerRunner:
+
+    def __init__(self, args):
+        self.proc = subprocess.Popen(
+            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        return True
+
+    def _wait_for_server(self):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    break
+            except Exception as err:
+                if self.proc.poll() is not None:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+
+    def __del__(self):
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+
+
+@pytest.fixture(scope="session")
+def server():
+    ray.init()
+    server_runner = ServerRunner.remote([
+        "--model",
+        MODEL_NAME,
+        "--dtype",
+        "bfloat16",  # use half precision for speed and memory savings in CI environment
+        "--max-model-len",
+        "8192"
+    ])
+    ray.get(server_runner.ready.remote())
+    yield server_runner
+    ray.shutdown()
+
+
+@pytest.fixture(scope="session")
+def client():
+    client = openai.AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    yield client
+
+
+async def test_single_completion(server, client: openai.AsyncOpenAI):
+    completion = await client.completions.create(model=MODEL_NAME,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+
+async def test_single_chat_session(server, client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+    )
+    assert chat_completion.id is not None
+    assert chat_completion.choices is not None and len(
+        chat_completion.choices) == 1
+    assert chat_completion.choices[0].message is not None
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+async def test_completion_streaming(server, client: openai.AsyncOpenAI):
+    prompt = "What is an LLM?"
+
+    single_completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    single_output = single_completion.choices[0].text
+    single_usage = single_completion.usage
+
+    stream = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks = []
+    async for chunk in stream:
+        chunks.append(chunk.choices[0].text)
+    assert chunk.choices[0].finish_reason == "length"
+    assert chunk.usage == single_usage
+    assert "".join(chunks) == single_output
+
+
+async def test_chat_streaming(server, client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks = []
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert "".join(chunks) == output
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
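Outside of CI, the same endpoints can be exercised by hand with the official client. A minimal sketch, assuming a server has already been started locally with `python -m vllm.entrypoints.openai.api_server --model HuggingFaceH4/zephyr-7b-beta` (as the ServerRunner above does); the API key value is an arbitrary placeholder mirroring the test fixture:

import asyncio

import openai  # official OpenAI client, same as in the tests above

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


async def main():
    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",  # vLLM's OpenAI-compatible endpoint
        api_key="token-abc123",  # placeholder value, mirroring the test fixture
    )
    completion = await client.completions.create(model=MODEL_NAME,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)
    print(completion.choices[0].text)


if __name__ == "__main__":
    asyncio.run(main())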
