Skip to content

Commit 2b4c77e

Browse files
authored
Merge branch 'main' into ryan/streamtool
2 parents d87a910 + 5fc0bf9 commit 2b4c77e

File tree

5 files changed

+102
-13
lines changed

5 files changed

+102
-13
lines changed

components/backends/vllm/src/dynamo/vllm/health_check.py

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,43 @@
77
This module defines the default health check payload for vLLM backends.
88
"""
99

10+
import logging
11+
1012
from dynamo.health_check import HealthCheckPayload
1113

14+
logger = logging.getLogger(__name__)
15+
16+
17+
def _get_bos_token_id_from_engine(engine_client) -> int:
    """
    Extract the BOS token ID from the vLLM engine client's tokenizer if available.

    The client's ``tokenizer`` attribute is read first. If that object has a
    nested ``tokenizer`` attribute, the nested object is used; otherwise the
    attribute itself is treated as the tokenizer. ``bos_token_id`` is then read
    from whichever object was selected.

    Args:
        engine_client: vLLM AsyncLLM engine client, or None.

    Returns:
        The tokenizer's ``bos_token_id`` converted to int, or 1 as fallback when
        the client is None, no tokenizer / BOS token ID can be found, or the
        lookup raises.
    """
    if engine_client is None:
        return 1

    try:
        tokenizer_holder = getattr(engine_client, "tokenizer", None)
        nested_tokenizer = getattr(tokenizer_holder, "tokenizer", None)
        # Compare against None rather than relying on truthiness: an object
        # that defines __len__/__bool__ may evaluate falsy and still be a
        # usable tokenizer.
        tokenizer = (
            tokenizer_holder if nested_tokenizer is None else nested_tokenizer
        )

        raw_bos_token_id = getattr(tokenizer, "bos_token_id", None)
        if raw_bos_token_id is not None:
            # Convert before logging so the success message is only emitted
            # for a value that is actually returned.
            bos_token_id = int(raw_bos_token_id)
            logger.info(
                "Using model's BOS token ID for health check: %s", bos_token_id
            )
            return bos_token_id
    except Exception as e:
        # Deliberate best-effort: the health check payload must still be
        # built when the tokenizer cannot be inspected.
        logger.debug("Failed to get BOS token from engine: %s", e)

    logger.debug("Using default BOS token ID (1) for health check")
    return 1
46+
1247

1348
class VllmHealthCheckPayload(HealthCheckPayload):
1449
"""
@@ -17,14 +52,20 @@ class VllmHealthCheckPayload(HealthCheckPayload):
1752
Provides vLLM defaults and inherits environment override support from base class.
1853
"""
1954

20-
def __init__(self):
55+
def __init__(self, engine_client=None):
2156
"""
2257
Initialize vLLM health check payload with vLLM-specific defaults.
58+
59+
Args:
60+
engine_client: Optional vLLM AsyncLLM engine client to extract BOS token from.
61+
If provided, will attempt to use the model's actual BOS token.
2362
"""
63+
bos_token_id = _get_bos_token_id_from_engine(engine_client)
64+
2465
# Set vLLM default payload - minimal request that completes quickly
2566
# The handler expects token_ids, sampling_options, and stop_conditions
2667
self.default_payload = {
27-
"token_ids": [1], # Single token for minimal processing
68+
"token_ids": [bos_token_id],
2869
"sampling_options": {
2970
"max_tokens": 1,
3071
"temperature": 0.0,
@@ -38,3 +79,44 @@ def __init__(self):
3879
},
3980
}
4081
super().__init__()
82+
83+
84+
class VllmPrefillHealthCheckPayload(HealthCheckPayload):
    """
    Health check payload for vLLM prefill workers in disaggregated mode.

    The prefill handler expects a different structure from the decode payload:
    the request is keyed by 'request_id' and 'sampling_params'.
    """

    def __init__(self, engine_client=None):
        """
        Build the default prefill health check request, then initialize the base class.

        Args:
            engine_client: Optional vLLM AsyncLLM engine client. It is passed to
                _get_bos_token_id_from_engine to pick the single prompt token.
        """
        prompt_token_id = _get_bos_token_id_from_engine(engine_client)

        # One-token request settings. The handler converts sampling_params via
        # msgspec, and kv_transfer_params flags the request for remote decode.
        kv_transfer_params = {"do_remote_decode": True}
        sampling_params = {
            "max_tokens": 1,
            "min_tokens": 1,
            "temperature": 0.0,
            "top_p": 1.0,
            "top_k": -1,
            "detokenize": False,
            "include_stop_str_in_output": False,
            "ignore_eos": False,
            "extra_args": {"kv_transfer_params": kv_transfer_params},
        }

        # default_payload is assigned before the base initializer runs, same
        # ordering as the decode payload class.
        self.default_payload = {
            "request_id": "health_check",
            "token_ids": [prompt_token_id],
            "sampling_params": sampling_params,
        }
        super().__init__()

components/backends/vllm/src/dynamo/vllm/main.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from .args import ENABLE_LMCACHE, Config, configure_ports, overwrite_args, parse_args
2626
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
27-
from .health_check import VllmHealthCheckPayload
27+
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
2828
from .publisher import StatLoggerFactory
2929

3030
configure_dynamo_logging()
@@ -145,8 +145,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
145145
runtime, component, engine_client, default_sampling_params
146146
)
147147

148-
# Get health check payload (checks env var and falls back to vLLM default)
149-
health_check_payload = VllmHealthCheckPayload().to_dict()
148+
health_check_payload = VllmPrefillHealthCheckPayload(engine_client).to_dict()
150149

151150
try:
152151
logger.debug("Starting serve_endpoint for prefill worker")
@@ -261,8 +260,7 @@ async def init(runtime: DistributedRuntime, config: Config):
261260
custom_template_path=config.custom_jinja_template,
262261
)
263262

264-
# Get health check payload (checks env var and falls back to vLLM default)
265-
health_check_payload = VllmHealthCheckPayload().to_dict()
263+
health_check_payload = VllmHealthCheckPayload(engine_client).to_dict()
266264

267265
try:
268266
logger.debug("Starting serve_endpoint for decode worker")

docs/guides/health_check.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ the service is running.
1818

1919
> **Note**: Frontend liveness doesn't depend on worker health or liveness, only on the Frontend service itself.
2020
21-
#### Example Request
21+
### Example Request
2222

2323
```
2424
curl -s localhost:8080/live -q | jq
2525
```
2626

27-
#### Example Response
27+
### Example Response
2828

2929
```
3030
{
@@ -41,13 +41,13 @@ the service is running. Once workers have been registered, the
4141

4242
> **Note**: Frontend liveness doesn't depend on worker health or liveness, only on the Frontend service itself.
4343
44-
#### Example Request
44+
### Example Request
4545

4646
```
4747
curl -v localhost:8080/health -q | jq
4848
```
4949

50-
#### Example Response
50+
### Example Response
5151

5252
Before workers are registered:
5353

docs/hidden_toctree.rst

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,24 @@
2424
API/nixl_connect/write_operation.md
2525
API/nixl_connect/README.md
2626

27+
guides/dynamo_deploy/api_reference.md
2728
guides/dynamo_deploy/create_deployment.md
28-
guides/dynamo_deploy/sla_planner_deployment.md
29+
30+
guides/dynamo_deploy/fluxcd.md
2931
guides/dynamo_deploy/gke_setup.md
3032
guides/dynamo_deploy/grove.md
31-
guides/dynamo_deploy/k8s_metrics.md
3233
guides/dynamo_deploy/model_caching_with_fluid.md
3334
guides/dynamo_deploy/README.md
3435
guides/dynamo_run.md
36+
guides/dynamo_deploy/sla_planner_deployment.md
3537
guides/metrics.md
3638
guides/run_kvbm_in_vllm.md
39+
guides/run_kvbm_in_trtllm.md
3740

3841
architecture/kv_cache_routing.md
3942
architecture/load_planner.md
4043
architecture/request_migration.md
44+
architecture/request_cancellation.md
4145

4246
components/backends/trtllm/multinode/multinode-examples.md
4347
components/backends/sglang/docs/multinode-examples.md

docs/index.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ Quickstart
5151
:caption: Kubernetes Deployment
5252

5353
Quickstart (K8s) <../guides/dynamo_deploy/README.md>
54+
Detailed Installation Guide <../guides/dynamo_deploy/installation_guide.md>
5455
Dynamo Operator <../guides/dynamo_deploy/dynamo_operator.md>
5556
Metrics <../guides/dynamo_deploy/metrics.md>
5657
Logging <../guides/dynamo_deploy/logging.md>
@@ -70,6 +71,10 @@ Quickstart
7071
:hidden:
7172
:caption: Developer Guide
7273

74+
Benchmarking Guide <benchmarks/benchmarking.md>
75+
Planner Benchmark Example <guides/planner_benchmark/README.md>
76+
Logging <guides/logging.md>
77+
Health Checks <guides/health_check.md>
7378
Tuning Disaggregated Serving Performance <guides/disagg_perf_tuning.md>
7479
Writing Python Workers in Dynamo <guides/backend.md>
7580
Glossary <dynamo_glossary.md>

0 commit comments

Comments
 (0)