Merge branch 'main' into ryan/streamtool

ayushag-nv · web-flow · commit 2b4c77e832b6 · 2025-09-22T15:13:10.000-07:00
diff --git a/components/backends/vllm/src/dynamo/vllm/health_check.py b/components/backends/vllm/src/dynamo/vllm/health_check.py
@@ -7,8 +7,43 @@
 This module defines the default health check payload for vLLM backends.
 """
 
+import logging
+
 from dynamo.health_check import HealthCheckPayload
 
+logger = logging.getLogger(__name__)
+
+
+def _get_bos_token_id_from_engine(engine_client) -> int:
+    """
+    Extract BOS token ID from the vLLM engine client's tokenizer if available.
+
+    Args:
+        engine_client: vLLM AsyncLLM engine client
+
+    Returns:
+        BOS token ID from the model's tokenizer, or 1 as fallback
+    """
+    if engine_client is None:  # no engine supplied: skip the lookup entirely
+        return 1
+
+    try:
+        tokenizer_group = getattr(engine_client, "tokenizer", None)  # None if the attribute is absent
+        if tokenizer_group:
+            tokenizer = getattr(tokenizer_group, "tokenizer", None)  # inner tokenizer, if present
+            if tokenizer:
+                bos_token_id = getattr(tokenizer, "bos_token_id", None)
+                if bos_token_id is not None:  # explicit None check so a BOS ID of 0 is accepted
+                    logger.info(
+                        f"Using model's BOS token ID for health check: {bos_token_id}"
+                    )
+                    return int(bos_token_id)
+    except Exception as e:  # best-effort: any lookup failure falls through to the default
+        logger.debug(f"Failed to get BOS token from engine: {e}")
+
+    logger.debug("Using default BOS token ID (1) for health check")  # reached when the lookup above did not return
+    return 1
+
 
 class VllmHealthCheckPayload(HealthCheckPayload):
     """
@@ -17,14 +52,20 @@ class VllmHealthCheckPayload(HealthCheckPayload):
     Provides vLLM defaults and inherits environment override support from base class.
     """
 
-    def __init__(self):
+    def __init__(self, engine_client=None):
         """
         Initialize vLLM health check payload with vLLM-specific defaults.
+
+        Args:
+            engine_client: Optional vLLM AsyncLLM engine client to extract BOS token from.
+                          If provided, will attempt to use the model's actual BOS token.
         """
+        bos_token_id = _get_bos_token_id_from_engine(engine_client)
+
         # Set vLLM default payload - minimal request that completes quickly
         # The handler expects token_ids, sampling_options, and stop_conditions
         self.default_payload = {
-            "token_ids": [1],  # Single token for minimal processing
+            "token_ids": [bos_token_id],
             "sampling_options": {
                 "max_tokens": 1,
                 "temperature": 0.0,
@@ -38,3 +79,44 @@ def __init__(self):
             },
         }
         super().__init__()
+
+
+class VllmPrefillHealthCheckPayload(HealthCheckPayload):
+    """
+    vLLM-specific health check payload for prefill workers in disaggregated mode.
+
+    The prefill handler expects a different structure with 'request_id' and 'sampling_params'.
+    """
+
+    def __init__(self, engine_client=None):
+        """
+        Initialize vLLM prefill health check payload with proper structure.
+
+        Args:
+            engine_client: Optional vLLM AsyncLLM engine client to extract BOS token from.
+                          If provided, will attempt to use the model's actual BOS token.
+        """
+        bos_token_id = _get_bos_token_id_from_engine(engine_client)  # yields 1 when no BOS ID can be read
+
+        # Prefill handler expects request_id, token_ids, and sampling_params
+        # The sampling_params are converted via msgspec in the handler
+        self.default_payload = {
+            "request_id": "health_check",  # constant ID used by every payload this class builds
+            "token_ids": [bos_token_id],  # single-token prompt
+            "sampling_params": {
+                "max_tokens": 1,
+                "min_tokens": 1,
+                "temperature": 0.0,
+                "top_p": 1.0,
+                "top_k": -1,
+                "detokenize": False,
+                "include_stop_str_in_output": False,
+                "ignore_eos": False,
+                "extra_args": {
+                    "kv_transfer_params": {
+                        "do_remote_decode": True,  # NOTE(review): presumably marks decode as happening on another worker — confirm
+                    }
+                },
+            },
+        }
+        super().__init__()  # runs after default_payload is set; NOTE(review): presumably the base class reads it — confirm
diff --git a/components/backends/vllm/src/dynamo/vllm/main.py b/components/backends/vllm/src/dynamo/vllm/main.py
@@ -24,7 +24,7 @@
 
 from .args import ENABLE_LMCACHE, Config, configure_ports, overwrite_args, parse_args
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
-from .health_check import VllmHealthCheckPayload
+from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
 from .publisher import StatLoggerFactory
 
 configure_dynamo_logging()
@@ -145,8 +145,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
         runtime, component, engine_client, default_sampling_params
     )
 
-    # Get health check payload (checks env var and falls back to vLLM default)
-    health_check_payload = VllmHealthCheckPayload().to_dict()
+    health_check_payload = VllmPrefillHealthCheckPayload(engine_client).to_dict()
 
     try:
         logger.debug("Starting serve_endpoint for prefill worker")
@@ -261,8 +260,7 @@ async def init(runtime: DistributedRuntime, config: Config):
             custom_template_path=config.custom_jinja_template,
         )
 
-    # Get health check payload (checks env var and falls back to vLLM default)
-    health_check_payload = VllmHealthCheckPayload().to_dict()
+    health_check_payload = VllmHealthCheckPayload(engine_client).to_dict()
 
     try:
         logger.debug("Starting serve_endpoint for decode worker")
diff --git a/docs/guides/health_check.md b/docs/guides/health_check.md
@@ -18,13 +18,13 @@ the service is running.
 
 > **Note**: Frontend liveness doesn't depend on worker health or liveness; it depends only on the Frontend service itself.
 
-#### Example Request
+### Example Request
 
 ```
 curl -s localhost:8080/live -q | jq
 ```
 
-#### Example Response
+### Example Response
 
 ```
 {
@@ -41,13 +41,13 @@ the service is running.  Once workers have been registered, the
 
 > **Note**: Frontend liveness doesn't depend on worker health or liveness; it depends only on the Frontend service itself.
 
-#### Example Request
+### Example Request
 
 ```
 curl -v localhost:8080/health -q | jq
 ```
 
-#### Example Response
+### Example Response
 
 Before workers are registered:
 
diff --git a/docs/hidden_toctree.rst b/docs/hidden_toctree.rst
@@ -24,20 +24,24 @@
    API/nixl_connect/write_operation.md
    API/nixl_connect/README.md
 
+   guides/dynamo_deploy/api_reference.md
    guides/dynamo_deploy/create_deployment.md
-   guides/dynamo_deploy/sla_planner_deployment.md
+
+   guides/dynamo_deploy/fluxcd.md
    guides/dynamo_deploy/gke_setup.md
    guides/dynamo_deploy/grove.md
-   guides/dynamo_deploy/k8s_metrics.md
    guides/dynamo_deploy/model_caching_with_fluid.md
    guides/dynamo_deploy/README.md
    guides/dynamo_run.md
+   guides/dynamo_deploy/sla_planner_deployment.md
    guides/metrics.md
    guides/run_kvbm_in_vllm.md
+   guides/run_kvbm_in_trtllm.md
 
    architecture/kv_cache_routing.md
    architecture/load_planner.md
    architecture/request_migration.md
+   architecture/request_cancellation.md
 
    components/backends/trtllm/multinode/multinode-examples.md
    components/backends/sglang/docs/multinode-examples.md
diff --git a/docs/index.rst b/docs/index.rst
@@ -51,6 +51,7 @@ Quickstart
    :caption: Kubernetes Deployment
 
    Quickstart (K8s) <../guides/dynamo_deploy/README.md>
+   Detailed Installation Guide <../guides/dynamo_deploy/installation_guide.md>
    Dynamo Operator <../guides/dynamo_deploy/dynamo_operator.md>
    Metrics <../guides/dynamo_deploy/metrics.md>
    Logging <../guides/dynamo_deploy/logging.md>
@@ -70,6 +71,10 @@ Quickstart
    :hidden:
    :caption: Developer Guide
 
+   Benchmarking Guide <benchmarks/benchmarking.md>
+   Planner Benchmark Example <guides/planner_benchmark/README.md>
+   Logging <guides/logging.md>
+   Health Checks <guides/health_check.md>
    Tuning Disaggregated Serving Performance <guides/disagg_perf_tuning.md>
    Writing Python Workers in Dynamo <guides/backend.md>
    Glossary <dynamo_glossary.md>