Skip to content

Commit 345bbf7

Browse files
committed
add check for running queue length and waiting queue length
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
1 parent 47a27fd commit 345bbf7

File tree

5 files changed

+22
-6
lines changed

5 files changed

+22
-6
lines changed

vllm/engine/protocol.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def generate(
6060
...
6161

6262
@abstractmethod
63-
def minimal_generation(self) -> str:
63+
async def minimal_generation(self) -> str:
6464
"""Generate outputs for a minimal spec prompt"""
6565
...
6666

vllm/entrypoints/openai/api_server.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,10 @@
9898
log_non_default_args, with_cancellation)
9999
from vllm.logger import init_logger
100100
from vllm.reasoning import ReasoningParserManager
101-
from vllm.transformers_utils.config import (
102-
maybe_register_config_serialize_by_value)
103101
from vllm.transformers_utils.tokenizer import MistralTokenizer
104102
from vllm.usage.usage_lib import UsageContext
105103
from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
106-
get_open_zmq_ipc_path,is_valid_ipv6_address, random_uuid,
107-
set_ulimit)
104+
is_valid_ipv6_address, set_ulimit)
108105
from vllm.v1.engine.exceptions import EngineDeadError
109106
from vllm.v1.metrics.prometheus import get_prometheus_registry
110107
from vllm.version import __version__ as VLLM_VERSION

vllm/v1/engine/async_llm.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -725,10 +725,14 @@ async def scale_elastic_ep(self,
725725
)
726726

727727
async def minimal_generation(self) -> str:
728-
prompt = "Hi"
728+
prompt = "Ping"
729729
sampling_params = SamplingParams(temperature=0, max_tokens=2)
730730
request_id = random_uuid()
731731
result_text = ""
732+
count = await self.engine_core.get_request_count()
733+
num_running_reqs, num_waiting_reqs = count[0], count[1]
734+
if num_running_reqs > 0 or num_waiting_reqs > 0:
735+
return result_text
732736
async for output in self.generate(prompt, sampling_params, request_id):
733737
for completion in output.outputs:
734738
result_text = completion.text

vllm/v1/engine/core.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,9 @@ def preprocess_add_request(
449449
self.structured_output_manager.grammar_init(req)
450450
return req, request.current_wave
451451

452+
def get_request_count(self) -> tuple[int, int]:
453+
return self.scheduler.get_request_counts()
454+
452455

453456
class EngineCoreProc(EngineCore):
454457
"""ZMQ-wrapper for running EngineCore in background process."""

vllm/v1/engine/core_client.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ async def collective_rpc_async(
230230
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
231231
raise NotImplementedError
232232

233+
async def get_request_count(self) -> tuple[int, int]:
234+
raise NotImplementedError
235+
233236

234237
class InprocClient(EngineCoreClient):
235238
"""
@@ -311,6 +314,9 @@ def collective_rpc(self,
311314
def dp_engines_running(self) -> bool:
312315
return False
313316

317+
async def get_request_count(self) -> tuple[int, int]:
318+
return self.engine_core.get_request_count()
319+
314320

315321
@dataclass
316322
class BackgroundResources:
@@ -755,6 +761,9 @@ def save_sharded_state(self,
755761
max_size: Optional[int] = None) -> None:
756762
self.call_utility("save_sharded_state", path, pattern, max_size)
757763

764+
async def get_request_count(self) -> tuple[int, int]:
765+
return self.call_utility("get_request_count")
766+
758767

759768
class AsyncMPClient(MPClient):
760769
"""Asyncio-compatible client for multi-proc EngineCore."""
@@ -958,6 +967,9 @@ async def collective_rpc_async(
958967
return await self.call_utility_async("collective_rpc", method, timeout,
959968
args, kwargs)
960969

970+
async def get_request_count(self) -> tuple[int, int]:
971+
return await self.call_utility_async("get_request_count")
972+
961973

962974
class DPAsyncMPClient(AsyncMPClient):
963975
"""Asyncio-compatible client for multi-proc, multi-engine (data parallel)

0 commit comments

Comments
 (0)