@@ -10,6 +10,7 @@
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import requests

 from tests.utils import RemoteOpenAIServer
 from tests.v1.test_utils import check_request_balancing
@@ -101,6 +102,8 @@ def start_server(sidx: int, r: int, sargs: list[str]):
                     sargs,
                     auto_port=False,
                     env_dict={
+                        "VLLM_SERVER_DEV_MODE":
+                        "1",
                         current_platform.device_control_env_var:
                         ",".join(
                             str(
@@ -214,7 +217,10 @@ def start_api_server():
                     self.model_name,
                     api_server_args,
                     auto_port=False,
-                    env_dict={})  # No GPUs needed for API-only server
+                    env_dict={
+                        "VLLM_SERVER_DEV_MODE": "1",
+                        # No GPUs needed for API-only server
+                    })
                 server.__enter__()
                 print(f"API-only server started successfully with "
                       f"{self.api_server_count} API servers")
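Both launch paths now export `VLLM_SERVER_DEV_MODE=1`, the flag that gates vLLM's development-only HTTP routes such as `/server_info`, which the new test below relies on. A minimal sketch of the same query outside pytest, assuming a server already started with this flag and listening on `localhost:8000` (host and port here are illustrative):

    # Sketch: query the dev-mode /server_info endpoint directly.
    # Assumes VLLM_SERVER_DEV_MODE=1 was set when the server started;
    # otherwise this route is not exposed.
    import requests

    resp = requests.get("http://localhost:8000/server_info",
                        params={"config_format": "json"})
    resp.raise_for_status()
    parallel = resp.json()["vllm_config"]["parallel_config"]
    print(parallel["_api_process_count"], parallel["_api_process_rank"])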
@@ -293,14 +299,21 @@ def default_server_args():


 @pytest.fixture(scope="module", params=[1, 4])
-def servers(request, default_server_args):
+def server_manager(request, default_server_args):
     api_server_count = request.param
-    with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
-                                          api_server_count,
-                                          default_server_args,
-                                          DP_SIZE // NUM_NODES,
-                                          TP_SIZE) as server_list:
-        yield server_list
+    server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
+                                                      api_server_count,
+                                                      default_server_args,
+                                                      DP_SIZE // NUM_NODES,
+                                                      TP_SIZE)
+
+    with server_manager:
+        yield server_manager
+
+
+@pytest.fixture
+def servers(server_manager):
+    return server_manager.servers


 @pytest.fixture(scope="module", params=[1, 4])
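Splitting the old `servers` fixture into `server_manager` plus a thin `servers` view keeps every existing test working unchanged while giving new tests access to manager attributes such as `api_server_count`. A hypothetical consumer of each fixture (test bodies here are illustrative, not part of the PR):

    # Illustrative only: existing tests keep depending on `servers`,
    # new tests can depend on `server_manager` for extra metadata.
    def test_existing_style(servers):
        head_server = servers[0][0]  # first (server, ...) entry
        assert head_server is not None

    def test_new_style(server_manager):
        assert server_manager.api_server_count in (1, 4)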
@@ -331,6 +344,34 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
     yield client


+def _get_parallel_config(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("server_info?config_format=json"))
+    response.raise_for_status()
+
+    vllm_config = response.json()["vllm_config"]
+    return vllm_config["parallel_config"]
+
+
+def test_multinode_dp_server_info(server_manager):
+    head_server = server_manager.servers[0][0]
+    api_server_count = server_manager.api_server_count
+
+    # Each request will hit one of the API servers.
+    # `n_reqs` is set so that there is a good chance each server
+    # receives at least one request.
+    n_reqs = 2 * api_server_count * api_server_count
+    parallel_configs = [
+        _get_parallel_config(head_server) for _ in range(n_reqs)
+    ]
+    api_process_counts = [c["_api_process_count"] for c in parallel_configs]
+    api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]
+
+    assert all(c == api_server_count
+               for c in api_process_counts), api_process_counts
+    assert all(0 <= r < api_server_count
+               for r in api_process_ranks), api_process_ranks
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
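The `n_reqs` heuristic in `test_multinode_dp_server_info` can be sanity-checked with a union bound: assuming requests land on the `k` API servers uniformly at random (the actual balancing policy may differ), the probability that some server sees none of the `2 * k * k` requests is at most `k * (1 - 1/k) ** (2 * k * k)`:

    # Back-of-envelope check of the n_reqs = 2 * k * k heuristic,
    # assuming uniformly random request placement across k API servers.
    for k in (1, 4):
        n = 2 * k * k
        missed = k * (1 - 1 / k) ** n  # union bound
        print(f"k={k}: n={n}, P(some server unused) <= {missed:.1e}")
    # k=4 gives roughly 4e-4, so every server almost surely gets a request.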