
Commit 47a27fd

add minimal_generation to async_llm

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>

1 parent 0388807 commit 47a27fd

File tree

3 files changed: 25 additions & 14 deletions

vllm/engine/protocol.py

Lines changed: 5 additions & 0 deletions

@@ -59,6 +59,11 @@ def generate(
         """Generate outputs for a request."""
         ...
 
+    @abstractmethod
+    def minimal_generation(self) -> str:
+        """Generate outputs for a minimal spec prompt"""
+        ...
+
     async def beam_search(
         self,
         prompt: PromptType,
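
With this change every concrete EngineClient must provide minimal_generation(). A minimal self-contained sketch of the contract (the EngineClientSketch and DummyClient classes and the canned reply are hypothetical, for illustration only, not part of this commit):

import asyncio
from abc import ABC, abstractmethod

class EngineClientSketch(ABC):
    # Stand-in for vllm.engine.protocol.EngineClient,
    # trimmed down to the method this commit adds.
    @abstractmethod
    async def minimal_generation(self) -> str:
        """Generate outputs for a minimal spec prompt"""
        ...

class DummyClient(EngineClientSketch):
    # Hypothetical client; a real one would run the engine here.
    async def minimal_generation(self) -> str:
        return "Hi!"

print(asyncio.run(DummyClient().minimal_generation()))  # -> 'Hi!'

Note that the abstract declaration in the diff is a plain def while the AsyncLLM implementation below is an async def; Python's ABC machinery accepts this, since only the presence of an overriding attribute with that name is checked at instantiation time.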

vllm/entrypoints/openai/api_server.py

Lines changed: 7 additions & 13 deletions

@@ -23,8 +23,7 @@
 import pydantic
 import regex as re
 import uvloop
-from fastapi import (APIRouter, Depends, FastAPI, Form, HTTPException, Query,
-                     Request)
+from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -99,13 +98,13 @@
                           log_non_default_args, with_cancellation)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
-from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
-                        is_valid_ipv6_address, random_uuid, set_ulimit)
+                        get_open_zmq_ipc_path, is_valid_ipv6_address, random_uuid,
+                        set_ulimit)
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
@@ -344,18 +343,13 @@ def engine_client(request: Request) -> EngineClient:
 
 
 @router.get("/health", response_class=Response)
-async def health(
-        raw_request: Request, generate: Optional[bool] = Query(False)) -> Response:
+async def health(raw_request: Request) -> Response:
     """Health check."""
     try:
         await engine_client(raw_request).check_health()
-        if generate:
-            prompt = "Hi"
-            sampling_params = SamplingParams(temperature=0, max_tokens=2)
-            request_id = random_uuid()
-            async for _ in engine_client(raw_request).generate(prompt, sampling_params,
-                                                               request_id):
-                pass
+        generate_str = raw_request.query_params.get("generate")
+        if generate_str == "true":
+            await engine_client(raw_request).minimal_generation()
         return Response(status_code=200)
     except EngineDeadError:
         return Response(status_code=503)
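
With the endpoint reworked, the deep health check is opted into via the raw query string. A quick client-side sketch, assuming a server at the default http://localhost:8000 (requests is just one convenient HTTP client, not something this commit uses):

import requests

# Plain liveness probe: only check_health() runs on the server.
r = requests.get("http://localhost:8000/health")
print(r.status_code)  # 200 if the engine is alive, 503 if it is dead

# Deep probe: additionally runs the two-token minimal_generation() pass.
r = requests.get("http://localhost:8000/health", params={"generate": "true"})
print(r.status_code)

Note the handler now does an exact string match against "true", so values such as ?generate=1 or ?generate=True no longer trigger generation, unlike the old Optional[bool] Query parameter, which FastAPI parsed leniently.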

vllm/v1/engine/async_llm.py

Lines changed: 13 additions & 1 deletion

@@ -33,7 +33,7 @@
                          init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv,
-                        deprecate_kwargs)
+                        deprecate_kwargs, random_uuid)
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
@@ -724,6 +724,18 @@ async def scale_elastic_ep(self,
             custom_stat_loggers=None,
         )
 
+    async def minimal_generation(self) -> str:
+        prompt = "Hi"
+        sampling_params = SamplingParams(temperature=0, max_tokens=2)
+        request_id = random_uuid()
+        result_text = ""
+        async for output in self.generate(prompt, sampling_params, request_id):
+            for completion in output.outputs:
+                result_text = completion.text
+            if output.finished:
+                break
+        return result_text
+
     @property
     def is_running(self) -> bool:
         # Is None before the loop is started.
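
A sketch of exercising the new method directly, assuming an engine built through AsyncLLM.from_engine_args (the model name below is only a placeholder):

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM

async def main() -> None:
    # Placeholder model; any small model the engine can load works the same way.
    engine = AsyncLLM.from_engine_args(AsyncEngineArgs(model="facebook/opt-125m"))
    try:
        # Runs the fixed "Hi" prompt for at most two greedy tokens
        # and returns the completion text.
        text = await engine.minimal_generation()
        print(f"minimal_generation -> {text!r}")
    finally:
        engine.shutdown()

if __name__ == "__main__":
    asyncio.run(main())

Since generate() streams cumulative RequestOutputs by default, keeping only the latest completion.text and breaking on output.finished yields the full two-token completion rather than a per-step delta.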
